In [None]:
# Run only once if you need to install the python-Levenshtein package
# (it is not imported directly in this notebook; presumably it is installed to speed up
# the fuzzywuzzy matching imported in the configuration cell -- TODO confirm)

# Install a pip package in the current Jupyter kernel
import sys
# sys.executable is the interpreter running this kernel, so pip installs into the kernel's
# own environment rather than into whichever Python happens to be first on the PATH
!{sys.executable} -m pip install python-Levenshtein

# Retrieve data using DOI

NOTE: this script is under development and is not intended for production use anywhere. It's been made public in case you want to borrow or hack any of the code for your own purposes. So no promises about anything!

It's got a ton of functions that are defined, but not used. I've stashed them here because I might use some of them later.

Currently the script only uses the CrossRef API, but it could potentially use others, such as DataCite.

## Configuration

This section imports libraries, sets default values, and defines functions

In [7]:
# retrieve_doi_data, a script for downloading data from CrossRef and preparing it to upload to Wikidata
# NOTE: the version number is also hard-coded separately inside the user_agent_header and tool_name
# strings in the configuration below; keep all of them in sync when the version changes.
version = '0.1'
created = '2021-11-06'

# (c) 2021 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# Note: this script requires the VanderBot script to upload the generated data to Wikidata
# For details, see https://github.com/HeardLibrary/linked-data/tree/master/vanderbot

import requests   # best library to manage HTTP transactions
import json
import re
from time import sleep
import csv
import sys
#import math
from fuzzywuzzy import fuzz # fuzzy logic matching

import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import pprint
# Pretty printer used for ad hoc inspection of nested query results while debugging
pp = pprint.PrettyPrinter(indent=4)

# Media type requested in the Accept header of both CrossRef and SPARQL requests
accept_media_type = 'application/json'
# SPARQL endpoint that all Wikidata queries in this script are POSTed to
endpoint = 'https://query.wikidata.org/sparql'
# Identifies this script (and a contact address) to the APIs in the User-Agent header
user_agent_header = 'retrieve_doi_data/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/crossref; mailto:steve.baskauf@vanderbilt.edu)'
# Seconds to pause after each SPARQL query
sparql_sleep = 0.1
# Language code used for label/description/title column names and for label filters in queries
default_language = 'en'

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
email_address = 'steve.baskauf@vanderbilt.edu' # put your email address here
tool_name = 'retrieve_doi_data0.1' # give your application a name here, no spaces

# Option to log to a file instead of the console
# These three defaults may be overridden by the command-line option handling that follows
log_path = '' # path to log file, default to none
log_object = sys.stdout # log output defaults to the console screen
file_path = '' # path to output directory, default to current working directory

# Command-line options. Each flag must be followed immediately by its value, e.g.
#     --log crossref_errors.txt --path data/
# opts and args are kept as module-level names in case other cells refer to them.
opts = [opt for opt in sys.argv[1:] if opt.startswith('-')]
args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]

def get_option_value(argv, flags):
    """Return the value that directly follows one of `flags` in `argv`.

    argv:  list of command-line strings (normally sys.argv[1:]).
    flags: equivalent spellings of one option, e.g. ('--log', '-L'). If several of them
           are present, the value of the one listed last wins.

    Returns the value string, or None when no flag is present or the flag has no value.

    The value is looked up by its position next to the flag. Pairing the n-th flag with the
    n-th non-flag argument breaks as soon as there is a positional argument or a flag that
    takes no value (for example the "-f <connection file>" pair that Jupyter itself passes).
    """
    value = None
    for flag in flags:
        if flag not in argv:
            continue
        position = argv.index(flag) + 1
        if position < len(argv) and not argv[position].startswith('-'):
            value = argv[position]
        else:
            print('Warning: option', flag, 'was given without a value and is ignored.', file=sys.stderr)
    return value

# --log / -L: send log output to the specified file (path including file name) instead of sys.stdout
log_path_option = get_option_value(sys.argv[1:], ('--log', '-L'))
if log_path_option is not None:
    log_path = log_path_option
    # the file is opened only once, even if both spellings of the option were supplied
    log_object = open(log_path, 'wt', encoding='utf-8')

# --path / -P: set the directory used for files. If omitted, the current working directory is used
file_path_option = get_option_value(sys.argv[1:], ('--path', '-P'))
if file_path_option is not None:
    file_path = file_path_option


# --------------------------
# *** For now, hard-code logging to crossref_errors.txt in the section that invokes the functions
#log_path = 'crossref_errors.txt'
#log_object = open(log_path, 'wt', encoding='utf-8')

# Also hard-coding the path until moved out of Jupyter notebook environment, see below
# --------------------------

# List of known work types used by CrossRef
# Each entry maps the CrossRef "type" string to the Wikidata item used as the instance_of value
# and to the description text written for the work. retrieve_crossref_data() leaves both
# values empty for any CrossRef type that is not listed here.
work_types = [
    {
    'crossref_type_string': 'journal-article',
    'qid': 'Q18918145', # academic journal article, alternatively Q13442814 scholarly article
    'description': 'journal article'
    },
    {
    'crossref_type_string': 'book-chapter',
    'qid': 'Q21481766', # "academic chapter"
    'description': 'academic book chapter'
    },
    {
    'crossref_type_string': 'monograph',
    'qid': 'Q193495', # monograph
    'description': 'monograph'
    }

]

# The three lists below drive extract_doi_metadata(): every name listed must be a key of the
# dictionary passed to it, and each list determines which output columns are generated.

# Output CSV label/description fields to be populated without references
# (copied to a single column each)
out_fields_labels = ['label_' + default_language, 'description_' + default_language]

# Output CSV property fields to be populated without references
# (each gets a value column plus an empty <field>_uuid column)
out_fields_noref = ['instance_of']

# Output CSV fields that include reference fields
# (each gets value, <field>_uuid and <field>_ref1_* columns; 'published' is stored as
# _nodeId/_val/_prec columns instead of a single value column)
out_fields_ref = ['doi', 'pmid', 'published', 'title_' + default_language, 'journal', 'volume', 'page', 'issue']
    
# ------------------------
# Utility functions
# ------------------------

# generates a dictionary to be passed in a requests GET method to generate the request header
def generate_header_dictionary(accept_media_type):
    """Build the HTTP headers for a plain GET request.

    accept_media_type: media type string to send in the Accept header.

    Returns a dictionary with the Accept header and a User-Agent header taken from the
    module-level user_agent_header string.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    headers['User-Agent'] = user_agent_header
    return headers

# Determine the current CrossRef rate limit by an initial ping
# Determine the current CrossRef rate limit by an initial ping.
# CrossRef reports its limit as "<limit_count> requests per <interval_sec> seconds" in two response headers.
response = requests.get('https://api.crossref.org/works/10.3233/SW-150203', headers=generate_header_dictionary(accept_media_type))
crossref_headers = response.headers
try:
    limit_count = int(crossref_headers['x-rate-limit-limit'])
    interval_string = crossref_headers['x-rate-limit-interval']
    interval_sec = int(interval_string.rstrip('s')) # remove the "s" from the end
    if limit_count <= 0 or interval_sec <= 0:
        raise ValueError('non-positive rate limit')
except (KeyError, ValueError):
    # The headers were missing or unparsable (e.g. an error page was returned). Rather than
    # stopping the whole notebook here, fall back to 50 requests per second, which is the example
    # value shown in the CrossRef REST API documentation -- adjust if CrossRef's policy differs.
    print('Warning: CrossRef rate limit headers not usable (HTTP status ' + str(response.status_code) + '); assuming 50 requests per 1s.', file=log_object)
    limit_count = 50
    interval_string = '1s'
    interval_sec = 1
# Seconds to wait between CrossRef calls: the allowed interval per request plus a small safety margin
api_sleep = interval_sec / limit_count + 0.005

# Due to problems with direct POST of UTF-8, changed to POST with URL-encoded parameters
# See https://www.w3.org/TR/sparql11-protocol/#update-via-post-urlencoded
# and https://stackoverflow.com/questions/34618149/post-unicode-string-to-web-service-using-python-requests-library

# NOTE: there are still some issues that have not been worked out with quotation marks in query strings.
# Still working on this; see also the send_sparql_query() below.
def generate_sparql_header_dictionary(accept_media_type, user_agent_header):
    """Build the HTTP headers for a SPARQL query sent as URL-encoded form parameters.

    accept_media_type: media type string to send in the Accept header.
    user_agent_header: string to send in the User-Agent header.

    Returns a dictionary with Accept, Content-Type and User-Agent entries.
    """
    # 'application/sparql-query' (direct POST of the query text) was used previously; the
    # form-encoded content type replaced it because of the UTF-8 problems noted in the comments above.
    return dict([
        ('Accept', accept_media_type),
        ('Content-Type', 'application/x-www-form-urlencoded'),
        ('User-Agent', user_agent_header),
    ])

# The following function requires the request header generated above
# (this module-level dictionary is read by send_sparql_query() and by the other functions
# that POST directly to the SPARQL endpoint)
sparql_request_header = generate_sparql_header_dictionary(accept_media_type, user_agent_header)

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return the current UTC date as an xsd:date string, form 2019-12-05.

    Uses a timezone-aware "now" because datetime.utcnow() is deprecated as of Python 3.12.
    """
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return the part of an IRI that follows its last slash.

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    A string without any slash is returned unchanged.
    """
    return iri.rsplit('/', 1)[-1]

# Open a CSV and read in the header fields as a list
def get_csv_fieldnames(filename):
    """Return the header row of a CSV file as a list of strings.

    filename: path of a UTF-8 encoded CSV file.

    Only the first row is read. Returns None if the file contains no rows.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return next(csv.reader(csv_file), None)

# Read from a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Read a CSV file into a list of dictionaries, one per data row.

    filename: path of a UTF-8 encoded CSV file whose first row holds the column names.

    Returns a list in file order; each dictionary is keyed by column name.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        rows = list(csv.DictReader(csv_file))
    return rows

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 encoded CSV file.

    table:      iterable of dictionaries, one per output row.
    filename:   path of the file to create (an existing file is overwritten).
    fieldnames: list of column names; it fixes the header row and the column order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# Load JSON file data from local drive into a Python data structure
def load_json_into_data_struct(path):
    """Load a UTF-8 encoded JSON file from the local drive into a Python data structure.

    path: path of the JSON file.

    Returns whatever the JSON document decodes to (dictionary, list, string, number, ...).
    """
    with open(path, 'rt', encoding='utf-8') as json_file:
        structure = json.load(json_file)
    # to view the data while debugging: print(json.dumps(structure, indent = 2))
    return structure

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header, endpoint, and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST a SPARQL SELECT query to the endpoint and return its result rows.

    query_string: the complete SPARQL query text.

    Returns the list found under results -> bindings in the JSON response.
    Any error raised while decoding the response propagates to the caller.
    """
    # The query travels as a URL-encoded form parameter named "query"
    form_data = {'query': query_string}
    response = requests.post(endpoint, data=form_data, headers=sparql_request_header)
    # print(response.text) here to view the raw response, e.g. if you are getting an error
    bindings = response.json()['results']['bindings']

    # delay to avoid hitting the Query Service too fast (only reached after a successful parse)
    sleep(sparql_sleep)
    return bindings

# ------------------------
# CrossRef search-related  functions
# ------------------------

# Use the CrossRef registration agency test to find out who issued the non-CrossRef DOI
def discover_issuing_agency(doi):
    """Ask the CrossRef "agency" route which registration agency issued a DOI.

    doi: the DOI string (not URL-encoded).

    Returns the agency id string from the response, 'not found' when the URL returns
    HTTP 404, or the raw response text when the response is not the expected JSON.
    """
    # 'import urllib' alone does not guarantee that the parse submodule has been loaded
    import urllib.parse

    encoded_doi = urllib.parse.quote(doi)
    search_url = 'https://api.crossref.org/works/' + encoded_doi + '/agency'
    response = requests.get(search_url, headers=generate_header_dictionary(accept_media_type))
    sleep(api_sleep) # delay to avoid hitting the API faster than acceptable rate
    if response.status_code == 404: # return "not found" if URL doesn't dereference
        return 'not found'
    try: # Try to parse as JSON
        data = response.json()
        return data['message']['agency']['id']
    # Catch only decoding/shape problems so that KeyboardInterrupt and real bugs still surface
    except (ValueError, KeyError, TypeError): # if the response isn't the expected JSON, just return the response text
        return response.text

# See https://github.com/CrossRef/rest-api-doc for API details
# Note: no authentication required, but must be "nice": observe rate limit, provide mailto:
def _parse_crossref_author(author):
    """Convert one CrossRef author record into the flat dictionary used by this script."""
    author_dict = {}
    # Note: in CrossRef, the ORCIDs are given as full IRIs, so ORCID string needs to be extracted
    author_dict['orcid'] = extract_local_name(author['ORCID']) if 'ORCID' in author else ''
    author_dict['sequence'] = author.get('sequence', '')
    author_dict['givenName'] = author.get('given', '')
    author_dict['familyName'] = author.get('family', '')
    # if there aren't any affiliations, the list will remain empty;
    # affiliation records that carry no name are skipped rather than raising KeyError
    author_dict['affiliation'] = [affiliation['name'] for affiliation in author.get('affiliation', []) if 'name' in affiliation]
    return author_dict

# See https://github.com/CrossRef/rest-api-doc for API details
# Note: no authentication required, but must be "nice": observe rate limit, provide mailto:
def retrieve_crossref_data(doi):
    """Retrieve the CrossRef metadata for a DOI and flatten it into a dictionary.

    doi: the DOI string (not URL-encoded).

    Returns
    - an empty dictionary when CrossRef answers HTTP 404 (the issuing agency is logged);
    - {'message': <response text>} when the response is not the expected JSON;
    - otherwise a dictionary with the keys doi, authors (only when CrossRef lists authors),
      published, volume, issue, page, journal_issn, title_<lang>, label_<lang>, journal_title,
      instance_of and description_<lang>. Values missing from CrossRef are empty strings
      (an empty list for journal_issn), so every key other than authors is always present.
    """
    # 'import urllib' alone does not guarantee that the parse submodule has been loaded
    import urllib.parse

    crossref_endpoint_url = 'https://api.crossref.org/works/'
    # urllib.parse.quote performs URL encoding of a string
    encoded_doi = urllib.parse.quote(doi)
    search_url = crossref_endpoint_url + encoded_doi
    response = requests.get(search_url, headers=generate_header_dictionary(accept_media_type))
    article_dict = {}
    if response.status_code == 404: # return empty dict if not found
        # *** NOTE: at some future time, look up data at alternative issuing agencies
        # For example, see https://support.datacite.org/docs/api-get-doi

        # For now, just log it
        print(doi, 'not CrossRef. DOI issuing agency:', discover_issuing_agency(doi), '\n', file=log_object)
        sleep(api_sleep)
        return article_dict

    try:
        data = response.json()['message']
        if not isinstance(data, dict):
            raise TypeError('CrossRef message is not a JSON object')
    # Catch only decoding/shape problems so that KeyboardInterrupt and real bugs still surface
    except (ValueError, KeyError, TypeError):
        # if not the expected JSON, just return the response text
        article_dict['message'] = response.text
        sleep(api_sleep)
        return article_dict
    #print(json.dumps(data, indent = 2))

    article_dict['doi'] = doi
    if 'author' in data:
        article_dict['authors'] = [_parse_crossref_author(author) for author in data['author']]

    # The publication date is taken from the date part of CrossRef's "created" dateTime.
    # (An earlier, disabled variant built it from the "issued" date-parts instead.)
    if 'created' in data:
        article_dict['published'] = data['created']['date-time'][0:10] # get date part from dateTime
    else:
        article_dict['published'] = ''

    for key in ('volume', 'issue', 'page'):
        article_dict[key] = data.get(key, '')
    article_dict['journal_issn'] = data.get('ISSN', [])

    # CrossRef gives titles as lists that may be present but empty; use '' in that case too,
    # so that the keys always exist for the functions that read them later
    titles = data.get('title') or []
    title = titles[0] if titles else ''
    article_dict['title_' + default_language] = title
    article_dict['label_' + default_language] = title
    container_titles = data.get('container-title') or []
    article_dict['journal_title'] = container_titles[0] if container_titles else ''

    # Map the CrossRef work type to a Wikidata class; unknown or missing types give empty strings
    instance_of = ''
    description = ''
    for work_type in work_types:
        if data.get('type') == work_type['crossref_type_string']:
            instance_of = work_type['qid']
            description = work_type['description']
    article_dict['instance_of'] = instance_of
    article_dict['description_' + default_language] = description

    sleep(api_sleep)
    return article_dict

def extract_doi_metadata(crossref_results, doi, pmid, today, alt_reference):
    """Turn the flattened CrossRef data for one work into a row for the output CSV.

    crossref_results: dictionary in the form returned by retrieve_crossref_data(); it must
                      contain every field named in out_fields_labels, out_fields_noref and
                      out_fields_ref (except journal, which is looked up here, and pmid).
    doi:              DOI of the work, used to build the default reference URL.
    pmid:             PubMed ID of the work, or '' if none. Used only when crossref_results
                      does not already carry a 'pmid' value.
    today:            date string recorded as the "retrieved" date of each reference.
    alt_reference:    reference URL to use instead of the DOI URL, or '' to use the DOI URL.

    Returns a dictionary keyed by output column name.
    Raises KeyError if a required field is missing from crossref_results.
    """
    crossref_results = extract_journal_qid(crossref_results)
    #print(crossref_results)

    # retrieve_crossref_data() never supplies a pmid, so fall back to the pmid argument
    # instead of failing with a KeyError when the 'pmid' output field is processed below
    crossref_results.setdefault('pmid', '' if pmid is None else str(pmid))

    out_dict = {'qid': ''}
    for field in out_fields_labels:
        out_dict[field] = crossref_results[field]
    for field in out_fields_noref:
        out_dict[field + '_uuid'] = ''
        out_dict[field] = crossref_results[field]
    for field in out_fields_ref:
        value = crossref_results[field]
        out_dict[field + '_uuid'] = ''
        if field == 'published':
            # dates are written as node ID / value / precision columns
            out_dict[field + '_nodeId'] = ''
            out_dict[field + '_val'] = value
            out_dict[field + '_prec'] = ''
        else:
            out_dict[field] = value

        # Only add a reference if there is a value for that field
        has_value = value != ''
        if not has_value:
            reference_url = ''
        elif field == 'pmid':
            reference_url = 'https://pubmed.ncbi.nlm.nih.gov/' + str(value) + '/'
        elif alt_reference == '':
            reference_url = 'http://doi.org/' + doi
        else:
            reference_url = alt_reference
        out_dict[field + '_ref1_hash'] = ''
        out_dict[field + '_ref1_referenceUrl'] = reference_url
        out_dict[field + '_ref1_retrieved_nodeId'] = ''
        out_dict[field + '_ref1_retrieved_val'] = today if has_value else ''
        out_dict[field + '_ref1_retrieved_prec'] = ''
    return out_dict

# ------------------------
# Wikidata search-related  functions
# ------------------------

# This function just returns either True or False depending on whether there is an item in Wikidata with a DOI
def doi_in_wikidata(doi):
    """Report whether any Wikidata item has the given DOI as its P356 value.

    doi: the DOI string; it is matched both as given and converted to upper case.

    Returns the boolean from the SPARQL ASK response.
    """
    query_lines = [
        'ask {',
        '    {?qid wdt:P356 "' + doi + '".}',
        'UNION',
        '    {?qid wdt:P356 "' + doi.upper() + '".}',
        '    }',
    ]
    payload = {'query': '\n'.join(query_lines)}
    response = requests.post(endpoint, data=payload, headers=sparql_request_header)
    return response.json()['boolean']

# Look up the ISSN from CrossRef in Wikidata
def extract_journal_qid(crossref_results):
    """Look up the journal of a work in Wikidata by ISSN and record its Q ID.

    crossref_results: dictionary in the form returned by retrieve_crossref_data(); its
                      'journal_issn' list is required, the label and 'journal_title' are
                      used for log messages and title comparison.

    Returns the same dictionary with a 'journal' key added: the Q ID of the matching
    journal, or '' when the work has no ISSN or no Wikidata item has that ISSN.
    Problems (no ISSN, no match, several matches, differing titles) are written to log_object.
    """
    article_label = crossref_results.get('label_' + default_language, '')
    crossref_journal_title = crossref_results.get('journal_title', '')

    if len(crossref_results['journal_issn']) == 0:
        crossref_results['journal'] = ''
        print('article:', article_label, 'has no ISSN.\n', file=log_object)
        return crossref_results

    # Create VALUES list for items, one quoted ISSN per line
    issns_string = '\n'.join('"' + issn + '"' for issn in crossref_results['journal_issn'])

    # Build query string
    query_string = '''select distinct ?journal ?journalLabel where {
      VALUES ?issn
        {
    ''' + issns_string + '''
        }
      ?journal wdt:P236 ?issn.
      ?journal rdfs:label ?journalLabel.
      filter(lang(?journalLabel)="en")
    }'''
    #print(query_string)

    # Send query to endpoint
    query_results = send_sparql_query(query_string)
    #pp.pprint(query_results)

    if len(query_results) > 1:
        print('Warning! More than one publication in Wikidata matched the ISSN for article', article_label, file=log_object)
        print(query_results, '\n', file=log_object)

    # Start from "no journal" so the value is defined even when the query finds nothing
    # (previously that case ended in an UnboundLocalError)
    journal_qid = ''
    if len(query_results) == 0:
        print('article:', article_label, 'no journal in Wikidata has ISSN', crossref_results['journal_issn'], '\n', file=log_object)

    # Extract Q ID from SPARQL query results. If there is more than one result, the last one will be used for the Q ID
    for result in query_results:
        journal_qid = extract_local_name(result['journal']['value'])
        journal_name = result['journalLabel']['value']
        if journal_name != crossref_journal_title:
            # NOTE: empirical testing of the fuzzywuzzy matchers (ratio, partial_ratio,
            # token_sort_ratio, token_set_ratio, WRatio) led to WRatio being used here
            w_ratio = fuzz.WRatio(journal_name, crossref_journal_title)
            if w_ratio < 99:
                print('article:', article_label, 'w_ratio:', w_ratio, 'Warning: Wikidata journal: "' + journal_name + '"', journal_qid, 'does not match CrossRef journal title: "' + crossref_journal_title + '"\n', file=log_object)
    crossref_results['journal'] = journal_qid
    return crossref_results

def generate_name_alternatives(name):
    """Generate the spelling variants of a personal name that are worth searching for.

    name: a name written given-names-first, e.g. 'John Q. Public, Jr.'. Commas and periods
          are treated as spaces.

    Returns a list of distinct strings in a stable order: the full name, the full name with
    its suffix (if any), and, when the name has more than one part, the combinations of
    given names / initials (with and without periods) followed by the last name.
    A one-word name yields only that word (plus the suffixed form); a name with no usable
    parts yields an empty list.
    """
    # Recognised trailing suffixes and the way each is written after the full name
    suffix_forms = {'Jr': ', Jr.', 'Sr': ', Sr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}

    # treat commas as if they were spaces
    # get rid of periods, sometimes periods are close up with no spaces
    pieces = name.replace(',', ' ').replace('.', ' ').split()

    # Remove ", Jr.", "III", etc. from end of name (never the only piece of the name)
    suffix = ''
    if len(pieces) > 1 and pieces[-1] in suffix_forms:
        suffix = suffix_forms[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
        suffix = ' the elder'
        pieces = pieces[:-2]

    # generate initials for all names
    # A piece that starts with a non-alphabetic character takes its initial from the second
    # character; typical cases are like (Kit) or "Kit". Only one leading character is skipped.
    # A piece with nothing left (a lone symbol) is dropped, so that pieces and initials always
    # stay the same length and share the same indexes.
    kept_pieces = []
    initials = []
    for piece in pieces:
        candidate = piece if piece[0].isalpha() else piece[1:]
        if candidate:
            kept_pieces.append(piece)
            initials.append(candidate[0])
    pieces = kept_pieces
    if not pieces:
        return []

    # full name
    full_name = ' '.join(pieces)
    alternatives = [full_name]

    # full name with suffix
    if suffix != '':
        alternatives.append(full_name + suffix)

    # The remaining forms combine given names with a last name, so they need at least two pieces
    if len(pieces) > 1:
        first_name = pieces[0]
        last_name = pieces[-1]
        first_initial = initials[0]
        middle_initials = initials[1:-1]
        given_initials = initials[:-1]

        # first and last name with initials
        alternatives.append(' '.join([first_name] + middle_initials + [last_name]))
        # first and last name with initials and periods
        alternatives.append(' '.join([first_name] + [initial + '.' for initial in middle_initials] + [last_name]))
        # first and last name only
        alternatives.append(first_name + ' ' + last_name)
        # first initial and last name only
        alternatives.append(first_initial + ' ' + last_name)
        # first initial with period and last name only
        alternatives.append(first_initial + '. ' + last_name)
        # all name initials with last name
        alternatives.append(' '.join(given_initials + [last_name]))
        # all name initials with periods with last name
        alternatives.append(' '.join([initial + '.' for initial in given_initials] + [last_name]))
        # all name initials concatenated with last name
        alternatives.append(''.join(given_initials) + ' ' + last_name)

    # remove duplicates, keeping the order of first appearance so the result is reproducible
    return list(dict.fromkeys(alternatives))

def searchWikidataForQIdByOrcid(orcid):
    """Find the Wikidata item whose ORCID (P496) equals the given string.

    orcid: the bare ORCID identifier string.

    Returns an empty dictionary when nothing matches; otherwise a dictionary with the keys
    qid, label and description (description is '' when the item has none in the default
    language). When several items match, a warning is printed and the first one is used.
    """
    query_string = '''
select distinct ?qid ?label ?description where {
    ?qid wdt:P496 "''' + orcid + '''".
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''
    matches = send_sparql_query(query_string)

    if not matches:
        return {}

    if len(matches) > 1:
        print('Warning!!! Multiple items with same ORCID!')
        print(matches)

    first_match = matches[0]
    description = first_match['description']['value'] if 'description' in first_match else ''
    found_item = {}
    found_item['qid'] = extract_local_name(first_match['qid']['value'])
    found_item['label'] = first_match['label']['value']
    found_item['description'] = description
    return found_item

def search_name_at_wikidata(name):
    """Search Wikidata for items whose label or alias matches a variant of a personal name.

    name: the name to search for; its variants come from generate_name_alternatives().

    Returns a list of {'qid': ..., 'name': ...} dictionaries, one per matching item (name is
    the item's English label), an empty list when there is nothing to search for or nothing
    matches, or [{'error': <response text>}] when the response is not the expected JSON.
    """
    # carry out search for most languages that use Latin characters, plus some other commonly used languages
    # See https://doi.org/10.1145/3233391.3233965
    language_codes = [
        'en', 'es', 'pt', 'fr', 'it', 'nl', 'de', 'da',
        'et', 'hu', 'ga', 'ro', 'sk', 'sl', 'zu', 'tr',
        'sv', 'zh', 'ru', 'ja', 'ar', 'pl', 'uk', 'ca',
        'cs', 'la', 'nb', 'he', 'eo', 'fi', 'ko'
      ]

    # get rid of quotes (and backslashes), which will break the query
    name_list = []
    for alternative in generate_name_alternatives(name):
        name_list.append(alternative.replace('"', '').replace("'", '').replace('\\', ''))
    if not name_list:
        return []

    # One language-tagged literal per name variant and language
    value_lines = []
    for language_code in language_codes:
        for alternative in name_list:
            value_lines.append('"' + alternative + '"@' + language_code + '\n')
    alternatives = ''.join(value_lines)

    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)
    #print('searching for ', name)
    results = []
    r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)
    try:
        data = r.json()
        #print(data)
        statements = data['results']['bindings']
        for statement in statements:
            wikidata_iri = statement['item']['value']
            if 'label' in statement:
                item_label = statement['label']['value']
            else:
                item_label = ''
            qnumber = extract_local_name(wikidata_iri)
            results.append({'qid': qnumber, 'name': item_label})
    # Catch only decoding/shape problems so that KeyboardInterrupt and real bugs still surface
    except (ValueError, KeyError, TypeError):
        results = [{'error': r.text}]
    # delay (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def find_surname_givens(name):
    """Split a personal name written given-names-first into given names and family name.

    name: the name string; periods and commas are treated as spaces.

    Returns {'given': <all parts but the last, joined by spaces>, 'family': <last part>},
    or False when fewer than two usable parts remain.
    """
    # Get rid of periods and commas, split, and drop empty pieces formed from extra spaces
    pieces = name.replace('.', ' ').replace(',', ' ').split()

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Make sure first character is alphabetic
    # only fixes the case where there is one non-alphabetic character, but more than one is rare
    # typical cases are like (Kit) or "Kit"
    cleaned_pieces = []
    for piece in pieces:
        if not piece[0].isalpha():
            piece = piece[1:] # remove the first non-alphabetic character, keep all of the rest
        if piece: # if there's nothing left, get rid of the piece
            cleaned_pieces.append(piece)
    pieces = cleaned_pieces

    # Get rid of ", Jr.", "III", etc.
    # NOTE(review): the first occurrence is removed wherever it stands, so a middle initial
    # "V" is dropped as well -- confirm whether only a trailing suffix should be removed
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces
    if len(pieces) < 2:
        return False

    # Put all but last piece together again
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def screen_qids(qids, screens):
    """Filter candidate Wikidata items, keeping those that satisfy at least one screen.

    A single SPARQL query is built and sent with send_sparql_query(). The candidate
    Q IDs are supplied in a VALUES block, each screen becomes a group graph pattern,
    and the screens are joined with UNION.

    Parameters
    ----------
    qids : list of str
        Candidate Q IDs, e.g. 'Q42'.
    screens : list of list of dict
        Each inner list is one screen whose requirements must all hold (AND); the
        screens themselves are alternatives (OR). Every requirement is read for:
        'entity'      '' for any value, a Q ID, or otherwise a string literal
        'lang'        language tag for a string literal ('' for none)
        'property'    'label', 'description', or a P ID
        'position'    'object' puts the value in the object position, anything else
                      puts it in the subject position
        'filter_type' '<', '>' or 'in' add a FILTER using 'filter_string'
        'require'     'exclude' wraps the pattern in MINUS

    Returns
    -------
    list of dict
        One dict per result row with keys 'qid', 'label' and 'description'
        ('' when the item has no description in the default language).
    """
    # An empty VALUES block can never bind ?qid, so the query could only return
    # nothing; skip the round trip to the endpoint.
    if not qids:
        return []

    qid_values = ''.join('wd:' + qid + '\n' for qid in qids)

    subgraph_patterns = []
    for screen in screens:
        # Each requirement in a screen has an AND relationship (all must be satisfied)
        subgraph_pattern = ''
        for requirement in screen:

            # Set the value if required or use a dummy variable if any value is allowed
            if requirement['entity'] == '':
                value = '?var' + requirement['property'] # add the property string to the variable to guarantee uniqueness
            elif re.fullmatch(r'Q\d+', requirement['entity']): # the whole string must be a Q ID, not just its start
                value = 'wd:' + requirement['entity']
            else: # if not nothing or a Q ID, assume it's a string literal
                value = '"' + requirement['entity'] + '"'
                if requirement['lang'] != '':
                    value += '@' + requirement['lang']

            # Set the predicate (label, description, or P value)
            if requirement['property'] == 'label':
                predicate = 'rdfs:label'
            elif requirement['property'] == 'description':
                predicate = 'schema:description'
            else:
                predicate = 'wdt:' + requirement['property']

            # Place the value in either the subject or object position in the triple
            if requirement['position'] == 'object':
                triple_pattern = '?qid ' + predicate + ' ' + value + '.'
            else:
                triple_pattern = value + ' ' + predicate + ' ?qid.'

            # Add filters if needed
            if requirement['filter_type'] in ('<', '>'):
                # note: string comparison only e.g. for datetimes, needs modification for actual numbers
                triple_pattern += '\nFILTER (STR(?var' + requirement['property'] + ') ' + requirement['filter_type'] + ' "' + requirement['filter_string'] + '")'

            if requirement['filter_type'] == 'in':
                # note: string comparison only
                triple_pattern += '\nFILTER (CONTAINS(?var' + requirement['property'] + ', "' + requirement['filter_string'] + '"))'

            # Use MINUS if you want to exclude items that fit the pattern.
            if requirement['require'] == 'exclude':
                triple_pattern = 'minus {' + triple_pattern + '}'

            subgraph_pattern += triple_pattern + '\n'

        # Wrap the screen in braces so that several can be UNIONed
        subgraph_patterns.append('{\n' + subgraph_pattern + '}\n')

    # Join the screens with UNION to create an OR relationship
    graph_pattern = 'UNION\n'.join(subgraph_patterns)

    query_string = '''
    select distinct ?qid ?label ?description where {
      VALUES ?qid
      {
      ''' + qid_values + '''}
    ''' + graph_pattern + '''
    
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''
    #print(query_string)

    results = send_sparql_query(query_string)
    #print(json.dumps(results, indent=2))

    return_list = []
    for result in results:
        out_dict = {
            'qid': extract_local_name(result['qid']['value']),
            'label': result['label']['value']
            }
        # The description is OPTIONAL in the query, so it may be absent from a row
        if 'description' in result:
            out_dict['description'] = result['description']['value']
        else:
            out_dict['description'] = ''
        return_list.append(out_dict)
    return return_list

# returns a list of results of articles by person with Wikidata ID qid
def search_wikidata_article(qid):
    """Query Wikidata for works whose author (P50) is the item with Q ID `qid`.

    Parameters
    ----------
    qid : str
        Wikidata Q ID of the person, e.g. 'Q42'.

    Returns
    -------
    list of dict
        One dict per result row with keys 'title' (label in the default language)
        and 'pmid' (PubMed ID, P698); either value is '' when it is not bound.
        If the response cannot be interpreted as SPARQL JSON results, a diagnostic
        is printed and an empty list is returned, so that callers indexing the
        items by 'title'/'pmid' do not fail on an error payload.
    """
    # P50 is "author"; P698 is the PubMed ID of the article
    query = '''select distinct ?title ?pmid where {
      ?article wdt:P50 wd:''' + qid + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = "'''+ default_language + '''")
          }
      optional {?article wdt:P698 ?pmid.}
      }'''
    #print(query)
    r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)

    results_list = []
    try:
        statements = r.json()['results']['bindings']
        for statement in statements:
            # Both variables come from OPTIONAL patterns, so either may be missing from a row
            title = statement['title']['value'] if 'title' in statement else ''
            pmid = statement['pmid']['value'] if 'pmid' in statement else ''
            results_list.append({'title': title, 'pmid': pmid})
    except (ValueError, LookupError, TypeError):
        # ValueError: body is not JSON; LookupError/TypeError: JSON without the expected structure
        print('Unexpected response from SPARQL endpoint for', qid + ':', r.text[:500])
        results_list = []

    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results_list

# returns lists of occupations, employers, and affiliations for a person with Wikidata ID qid
def search_wikidata_occ_emp_aff(qid):
    """Retrieve the occupation, employer and affiliation labels of a Wikidata item.

    The query asks for the default-language labels of the values of P106, P108
    and P1416 for the item `qid`; each of the three is optional.

    Parameters
    ----------
    qid : str
        Wikidata Q ID of the person.

    Returns
    -------
    tuple of (list, list, list)
        De-duplicated occupation, employer and affiliation labels, in that order.
        The order of the labels within each list is not defined.
    """
    query_string = '''select distinct ?occupation ?employer ?affiliation where {
        optional {
            wd:'''+ qid + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P108 ?employerId.
            ?employerId rdfs:label ?employer.
            FILTER(lang(?employer) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P1416 ?affiliationId.
            ?affiliationId rdfs:label ?affiliation.
            FILTER(lang(?affiliation) = "'''+ default_language + '''")            
            }
        }'''

    bindings = send_sparql_query(query_string)

    # Gather the values of each variable in one pass; any of them may be unbound in a row
    labels_by_variable = {'occupation': [], 'employer': [], 'affiliation': []}
    for binding in bindings:
        for variable, labels in labels_by_variable.items():
            if variable in binding:
                labels.append(binding[variable]['value'])

    # The same label can appear in many rows, so collapse duplicates
    occupations = list(set(labels_by_variable['occupation']))
    employers = list(set(labels_by_variable['employer']))
    affiliations = list(set(labels_by_variable['affiliation']))

    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return occupations, employers, affiliations

# ------------------------
# PubMed-related functions
# ------------------------

def retrieve_pubmed_id(doi):
    """Look up the PubMed ID recorded for a DOI by the NCBI ID converter service.

    Parameters
    ----------
    doi : str
        DOI of the article, sent as the `ids` query parameter.

    Returns
    -------
    str
        The value of `pmid` in the first record of the JSON response, or '' when
        the request returns 404, the body is not JSON, or no such value is present.
    """
    fetch_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'
    param_dict = {
        'tool': tool_name,
        'email': email_address,
        'format': 'json',
        'ids': doi
    }
    response = requests.get(fetch_url, params=param_dict)
    #print(response.url)

    pmid = '' # stays empty if the constructed URL won't dereference or no PMID is found
    if response.status_code != 404:
        try:
            pmid = response.json()['records'][0]['pmid']
        except (ValueError, LookupError, TypeError):
            # ValueError: body is not JSON
            # LookupError: no records, or the record has no 'pmid'
            # TypeError: JSON that is not shaped like {'records': [{...}]}
            pmid = ''

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.35) # wait just over a third of a second before hitting the API again to avoid getting blocked
    return pmid

def retrieve_pubmed_data(pmid):
    """Fetch the author records of a PubMed article from the NCBI EFetch utility.

    Parameters
    ----------
    pmid : str
        PubMed ID of the article, sent as the `id` query parameter.

    Returns
    -------
    list of dict
        One dict per <Author> element in the response, with keys 'affiliation'
        (text of the first AffiliationInfo/Affiliation), 'surname' (LastName),
        'forename' (ForeName) and 'orcid' (text of the first Identifier whose
        Source attribute is 'ORCID'). A value is '' when the element is missing
        or has no text. The list is empty when the request returns 404 or the
        response body is not well-formed XML.
    """
    fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    param_dict = {
        'tool': tool_name,
        'email': email_address,
        'db': 'pubmed',
         #'retmode': 'xml',
        'rettype': 'abstract',
        'id': pmid
    }
    response = requests.get(fetch_url, params=param_dict)
    #print(response.url)

    affiliations = [] # stays empty if the constructed URL won't dereference
    root = None
    if response.status_code != 404:
        # the response text is expected to be XML
        try:
            root = et.fromstring(response.text)
        except et.ParseError as parse_error:
            # e.g. an error page instead of XML; treat it like an article that could not be retrieved
            print('Could not parse PubMed response for', pmid, '-', parse_error)

    if root is not None:
        for author in root.findall('.//Author'):
            # An author may carry several identifiers; use the first one that is an ORCID
            orcid = ''
            for id_field in author.findall('./Identifier'):
                if id_field.get('Source') == 'ORCID':
                    orcid = id_field.text or ''
                    break

            # findtext() returns the default for a missing element and '' for an empty one,
            # so every value is guaranteed to be a string
            affiliations.append({
                'affiliation': author.findtext('./AffiliationInfo/Affiliation', default=''),
                'surname': author.findtext('./LastName', default=''),
                'forename': author.findtext('./ForeName', default=''),
                'orcid': orcid
            })

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.35) # wait before hitting the API again to avoid getting blocked
    #print(affiliations)
    return affiliations

# NOTE: The affiliation is a list of strings. All other arguments are strings
def identified_in_pubmed(pmid, name, affiliations, orcid):
    """Decide whether a person appears among the authors of a PubMed article.

    The article's authors are fetched with retrieve_pubmed_data(). Authors whose
    surname is sufficiently similar to `name` are then checked, first by ORCID
    (when both sides have one) and otherwise by affiliation similarity.

    Parameters
    ----------
    pmid : str
        PubMed ID of the article to inspect.
    name : str
        Name of the person being looked for; compared against author surnames only.
    affiliations : list of str
        Known affiliations of the person.
    orcid : str
        ORCID of the person, or '' if unknown.

    Returns
    -------
    bool
        True on an ORCID match or an affiliation match; False on an ORCID
        mismatch for a surname-matched author, or when no author qualifies.
    """
    min_affiliation_ratio = 70 # ratio required when a generic name similarity is crosschecked with dept name
    min_surname_ratio = 90 # similarity required for a potential match of a generic wikidata match

    article_authors = retrieve_pubmed_data(pmid)
    if article_authors == []:
        print('PubMed ID does not seem to be valid.')

    for article_author in article_authors:
        # Only the SURNAME is compared, so a coauthor with the same surname can be confused with the person.
        surname_ratio = fuzz.token_set_ratio(article_author['surname'], name)
        if surname_ratio < min_surname_ratio:
            continue

        # The ORCID comparison is only possible when both the person and the article author have one,
        # and it is decisive either way.
        if article_author['orcid'] != '' and orcid != '':
            if orcid == extract_local_name(article_author['orcid']):
                print(article_author['forename'] + ' ' + article_author['surname'] + ' has matching ORCID ' + article_author['orcid'])
                return True
            # Reject the article if the matched surname has an inconsistent ORCID
            print('*** ' + article_author['forename'] + ' ' + article_author['surname'] + ' is NOT the same person; ORCID ' + article_author['orcid'] + ' does not match.')
            return False

        # Without an affiliation on the article author there is nothing more to compare
        if article_author['affiliation'] == '':
            continue

        # Report every comparison; the first known affiliation that is similar enough counts as a match
        for known_affiliation in affiliations:
            affiliation_ratio = fuzz.token_set_ratio(known_affiliation, article_author['affiliation'])
            print('Affiliation test: ', affiliation_ratio, article_author['affiliation'])
            if affiliation_ratio >= min_affiliation_ratio:
                print('*** pubmed_author/affiliation match!')
                return True

    return False

# ------------------------
# Complex functions
# ------------------------

def disambiguate_authors(doi, authors, pmid):
    """Try to determine the Wikidata Q ID of each author of an article.

    Four lookup tables are read with read_dicts_from_csv() from `file_path`
    (researchers.csv, vanderbilt_wikidata_altlabels.csv, departments.csv and
    department_labels.csv) and the screen definitions from screens.json.

    When `pmid` is not '', the PubMed author records are retrieved and used to fill
    in a missing affiliation or ORCID of the fuzzily matching entry of `authors`.
    The dicts in `authors` are modified in place.

    Each author is then passed through a cascade of tests that stops at the first hit:
      1. exact match of "givenName familyName" to a researcher label
      2. exact match to an alternate label
      3. Wikidata search by ORCID (if the author has one)
      4. fuzzy match (ratio > 90, identical family name) to a researcher label
      5. fuzzy match (ratio > 90, identical family name) to an alternate label
      6. Wikidata name search, screened with screen_qids(), requiring an identical
         family name and WRatio > 80, and confirmed either by an employer label
         containing 'Vanderbilt University' or by a PubMed author match in one of the
         candidate's articles (at most `max_pmids_to_check` articles are checked)

    Parameters
    ----------
    doi : str
        Not used in the body of this function.
    authors : list of dict
        Each dict is read for 'givenName', 'familyName', 'orcid' and 'affiliation'
        (a list of strings).
    pmid : str
        PubMed ID of the article, or '' if there is none.

    Returns
    -------
    tuple of (list, list)
        found_qid_values: dicts with keys 'qid', 'stated_as', 'series_ordinal'.
        not_found_author_list: dicts with keys 'author_string', 'series_ordinal'.
        'series_ordinal' is the 1-based position of the author in `authors`.
    """
    filename = 'researchers.csv'
    researchers = read_dicts_from_csv(file_path + filename)

    filename = 'vanderbilt_wikidata_altlabels.csv'
    altnames = read_dicts_from_csv(file_path + filename)

    filename = 'departments.csv'
    departments = read_dicts_from_csv(file_path + filename)

    filename = 'department_labels.csv'
    department_labels = read_dicts_from_csv(file_path + filename)

    max_pmids_to_check = 10
    # If there is a PubMed ID for the article, retrieve the author info
    if pmid != '':
        pubmed_author_info = retrieve_pubmed_data(pmid)
        print('retrieved data from PubMed ID', pmid)
        # Add a combined 'name' value to each PubMed author record for comparison with the CrossRef names
        for author_index in range(len(pubmed_author_info)):
            pubmed_author_info[author_index]['name'] = pubmed_author_info[author_index]['forename'] + ' ' + pubmed_author_info[author_index]['surname']
    else:
        print('no PubMed data')

    #crossref_results['pmid'] = pmid

    # Augment CrossRef data with PubMed data. Typically the PubMed data is more likely to have the affiliations
    # Names are generally very similar, but vary with added or missing periods on initials and suffixes
    if pmid != '':
        for author_index in range(len(authors)):
            found = False
            crossref_name = authors[author_index]['givenName'] + ' ' + authors[author_index]['familyName']
            #print(crossref_name)
            for pubmed_author in pubmed_author_info:
                ratio = fuzz.ratio(pubmed_author['name'], crossref_name)
                #print(ratio, pubmed_author['name'])
                if ratio > 87: # had to drop down to this level because some people with missing "Jr" weren't matching
                    found = True
                    result_string = 'fuzzy label match: ' + str(ratio) + pubmed_author['name'] + ' / ' + crossref_name
                    #print(result_string)
                    break
            if not found:
                print('Did not find a match in the PubMed data for', crossref_name)
            else:
                # pubmed_author still refers to the record that caused the break above
                #print(pubmed_author)
                #print(authors[author_index])

                # If there is a PubMed affiliation and no affiliation in the CrossRef data, add the PubMed affiliation
                if pubmed_author['affiliation'] != '':
                    if len(authors[author_index]['affiliation']) == 0:
                        authors[author_index]['affiliation'].append(pubmed_author['affiliation'])

                # If there is an ORCID in PubMed and no ORCID in the CrossRef data, add the ORCID to CrossRef data
                # Not sure how often this happens since I think maybe usually of one has it, the other does, too.
                if pubmed_author['orcid'] != '':
                    if authors[author_index]['orcid'] == '':
                        authors[author_index]['orcid'] = pubmed_author['orcid']

                #print(authors[author_index])

            #print()
    #print(json.dumps(pubmed_author_info, indent=2))


    # screens.json is a configuration file that defines the kinds of screens to be performed on potential Q ID matches from Wikidata
    screens = load_json_into_data_struct('screens.json')

    # Perform screening operations on authors to try to determine their Q IDs
    found_qid_values = []
    not_found_author_list = []
    author_count = 1 # 1-based position of the current author, reported as the series ordinal
    for author in authors:
        print(author_count)
        found = False
        
        # First eliminate the case where all of the name pieces are empty
        # NOTE(review): `break` ends processing of ALL remaining authors, not only this one.
        # Confirm that a blank name is meant to mark the end of the list; otherwise `continue` was probably intended.
        if (author['givenName'] + ' ' + author['familyName']).strip() == '':
            break
        # screen for exact match to Wikidata labels
        for researcher in researchers:
            if researcher['label_en'] == author['givenName'] + ' ' + author['familyName']:
                found = True
                result_string = 'researcher exact label match: ' + researcher['qid'] + ' ' + researcher['label_en']
                name = researcher['label_en']
                qid = researcher['qid']
                break
        if not found:
            # screen for exact match to alternate names
            for altname in altnames:
                if altname['altLabel'] == author['givenName'] + ' ' + author['familyName']:
                    found = True
                    result_string = 'researcher altname match: ' + altname['qid'] + ' ' + altname['altLabel']
                    name = altname['altLabel']
                    qid = altname['qid']
                    break
            if not found:
                # If the researcher has an ORCID, see if it's at Wikidata
                if author['orcid'] != '':
                    hit = searchWikidataForQIdByOrcid(author['orcid'])
                    # An empty dict is treated as "no item with this ORCID"
                    if hit != {}:
                        found = True
                        result_string = 'Wikidata ORCID search: ' + hit['qid'] + ' ' + hit['label'] + ' / ' + hit['description']
                        name = hit['label']
                        qid = hit['qid']

                if not found:
                    # screen for fuzzy match to Wikidata-derived labels
                    for researcher in researchers:
                        # Require the surname to match the label surname exactly
                        split_names = find_surname_givens(researcher['label_en']) # returns False if no family name
                        if split_names: # skip names that don't have 2 parts !!! also misses non-English labels!
                            if split_names['family'] == author['familyName']: # require exact match to family name
                                w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                if w_ratio > 90:
                                    found = True
                                    result_string = 'fuzzy label match: ' + str(w_ratio) + ' ' + researcher['qid'] + ' ' + researcher['label_en'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                    name = researcher['label_en']
                                    qid = researcher['qid']
                                    break
                    if not found:
                        # screen for fuzzy match to alternate names
                        for altname in altnames:
                            split_names = find_surname_givens(altname['altLabel'])
                            if split_names: # skip names that don't have 2 parts
                                if split_names['family'] == author['familyName']: # require exact match to family name
                                    w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    if w_ratio > 90:
                                        found = True
                                        result_string = 'researcher altname fuzzy match: ' + str(w_ratio) + ' ' + altname['qid'] + ' ' + altname['altLabel'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                        name = altname['altLabel']
                                        qid = altname['qid']
                                        break
                        if not found:
                            # Last resort: search Wikidata by name and try to confirm one of the candidates
                            name = author['givenName'] + ' ' + author['familyName']
                            print('Searching Wikidata for', name)
                            print('researcher known affiliations: ', author['affiliation'])
                            print()
                            hits = search_name_at_wikidata(name)
                            #print(hits)

                            # NOTE(review): assumes every hit carries a 'qid' key — confirm that
                            # search_name_at_wikidata cannot return an error record here
                            qids = []
                            for hit in hits:
                                qids.append(hit['qid'])
                            # Keep only the candidates that pass at least one of the configured screens
                            return_list = screen_qids(qids, screens)
                            #print(return_list)

                            for hit in return_list:

                                split_names = find_surname_givens(hit['label'])

                                # Require the surname to match the Wikidata label surname exactly
                                # This prevents a high fraction of fuzzy matches where the last names are similar but not the same
                                if split_names: # skip names that don't have 2 parts
                                    if split_names['family'] == author['familyName']: # require exact match to family name
                                        #print(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print(hit)
                                        w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('w_ratio:', w_ratio)
                                        #ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('ratio:', ratio)
                                        #partial_ratio = fuzz.partial_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('partial_ratio:', partial_ratio)
                                        #token_sort_ratio = fuzz.token_sort_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('token_sort_ratio:', token_sort_ratio)
                                        #token_set_ratio = fuzz.token_set_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('token_set_ratio:', token_set_ratio)

                                        # This screen requires a high degree of similarity between the overall ORCID names and Wikidata labels
                                        if w_ratio > 80:
                                            print('Wikidata search fuzzy match:', w_ratio, author['givenName'] + ' ' + author['familyName'], ' / ', 'https://www.wikidata.org/wiki/'+ hit['qid'], hit['label'])
                                            print('Wikidata description: ', hit['description'])

                                            # Here we need to check Wikidata employer and affiliation and fuzzy match against known affiliations
                                            occupations, employers, affilliations = search_wikidata_occ_emp_aff(hit['qid'])
                                            print('occupations:', occupations)
                                            print('employers:', employers)
                                            print('affilliations', affilliations)
                                            print()

                                            # Perform a check of the employer to make sure we didn't miss somebody in the earlier
                                            # string matching
                                            # Note: the PubMed check below still runs after an employer match and,
                                            # if it also matches, replaces result_string.
                                            for employer in employers:
                                                if 'Vanderbilt University' in employer: # catch university and med center
                                                    found = True
                                                    result_string = 'Match Vanderbilt employer in Wikidata: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                    qid = hit['qid']

                                            # If the author doesn't have any known affiliations, there is no point in checking PubMed
                                            if author['affiliation'] != []:
                                                # Search Wikidata for articles written by this match
                                                articles_in_wikidata = search_wikidata_article(hit['qid'])
                                                #print(articles_in_wikidata)

                                                # Step through articles with PubMed IDs found in Wikidata and see if the author affiliation or ORCID matches any of the articles
                                                check = 0 # number of articles with a PubMed ID seen so far
                                                for article_in_wikidata in articles_in_wikidata:
                                                    if article_in_wikidata['pmid'] != '':
                                                        check += 1
                                                        if check > max_pmids_to_check:
                                                            print('More articles, but stopping after checking', max_pmids_to_check)
                                                            break # break out of article-checking loop
                                                        print('Checking article, PMID:', article_in_wikidata['pmid'], article_in_wikidata['title'])
                                                        pubmed_match = identified_in_pubmed(article_in_wikidata['pmid'], author['givenName'] + ' ' + author['familyName'], author['affiliation'], author['orcid'])
                                                        if not pubmed_match:
                                                            #print('no match')
                                                            print()
                                                        else:
                                                            found = True
                                                            result_string = 'PubMed affilation match: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                            qid = hit['qid']
                                                            break # break out of article-checking loop

                                            if found:
                                                break # break out of hit list loop
                                            print()

        if not found:
            not_found_author_list.append({'author_string': author['givenName'] + ' ' + author['familyName'], 'series_ordinal': author_count})
            print('not found:', author['givenName'] + ' ' + author['familyName'])

        else:
            found_qid_values.append({'qid': qid, 'stated_as': author['givenName'] + ' ' + author['familyName'], 'series_ordinal': author_count})
            print(result_string)
            # Report the label of the department recorded for the matched person, if there is one.
            # Only the first matching label of each department row is printed.
            for department in departments:
                if qid == department['qid']:
                    for department_label in department_labels:
                        if department_label['qid'] == department['affiliation']:
                            print(department_label['label_en'])
                            break
        print()
        author_count += 1

    print()
    return found_qid_values, not_found_author_list

def retrieve_spreadsheet_data():
    """Build an article record from local CSV files instead of from CrossRef.

    Reads the first row of spreadsheet_input_article.csv for the article metadata
    and every row of spreadsheet_input_author.csv for the authors, both with
    read_dicts_from_csv(). Missing columns produce empty values.

    Returns
    -------
    tuple of (dict, str)
        article_dict with keys 'doi', 'authors', 'published', 'volume', 'issue',
        'page', 'journal_issn' (list), 'title_<lang>', 'label_<lang>',
        'journal_title', 'instance_of' and 'description_<lang>', where <lang> is
        default_language. Each item of 'authors' has keys 'orcid', 'sequence',
        'givenName', 'familyName' and 'affiliation' (a list that is empty when no
        affiliation is given).
        alt_reference is the value of the 'alt_reference' column, or ''.

    Raises
    ------
    IndexError
        If the article spreadsheet has no data row.
    """
    # Spreadsheet has only one row
    data = read_dicts_from_csv('spreadsheet_input_article.csv')[0]
    authors = read_dicts_from_csv('spreadsheet_input_author.csv')

    # alt_reference is used in the case where the DOI doesn't exist or won't dereference
    alt_reference = data.get('alt_reference', '')

    article_dict = {'doi': data.get('doi', '')}

    author_list = []
    for author in authors:
        author_dict = {
            'orcid': author.get('orcid', ''),
            'sequence': author.get('sequence', ''),
            'givenName': author.get('givenName', ''),
            'familyName': author.get('familyName', '')
        }
        # only one affiliation in table, but output as list to be compatible with DOI data
        # if there isn't an affiliation (no column, or a blank cell), the list remains empty
        # so that "no affiliation" can be detected by testing for an empty list
        affiliation = author.get('affiliation', '')
        if affiliation and affiliation.strip():
            author_dict['affiliation'] = [affiliation]
        else:
            author_dict['affiliation'] = []
        author_list.append(author_dict)
    article_dict['authors'] = author_list

    # Columns that are copied unchanged
    for column in ('published', 'volume', 'issue', 'page'):
        article_dict[column] = data.get(column, '')

    if 'journal_issn' in data:
        article_dict['journal_issn'] = [data['journal_issn']]
    else:
        article_dict['journal_issn'] = []

    # The label_en column supplies both the title and the label
    title = data.get('label_en', '')
    article_dict['title_' + default_language] = title
    article_dict['label_' + default_language] = title

    article_dict['journal_title'] = data.get('journal_title', '')

    # Translate the work type string into the item and description to use.
    # If several entries of work_types match, the last one wins.
    instance_of = ''
    description = ''
    if 'work_type' in data:
        for work_type in work_types:
            if data['work_type'] == work_type['crossref_type_string']:
                instance_of = work_type['qid']
                description = work_type['description']
    article_dict['instance_of'] = instance_of
    article_dict['description_' + default_language] = description

    return article_dict, alt_reference

# ------------------------
# Top level composite functions
# ------------------------

def create_article_record(doi, articles_list, stored_retrieved_authors):
    """Build the record for one article and persist it, along with its raw author data.

    Parameters
    ----------
    doi : str
        DOI of the work (upper-cased before use). An empty string means the metadata
        is taken from retrieve_spreadsheet_data() instead of CrossRef.
    articles_list : list of dict
        Article records accumulated so far; the new record is appended to it.
    stored_retrieved_authors : list of dict
        Accumulated {'doi', 'authors'} rows; the new row is appended to it.

    Returns
    -------
    tuple
        (articles_list, stored_retrieved_authors), each extended by one item when usable
        data were found and otherwise unchanged.

    Side effects
    ------------
    Rewrites articles.csv and stored_retrieved_authors.csv (under the global file_path)
    after every successful lookup, and reports screening problems to both stdout and the
    global log_object.
    """
    doi = doi.upper()
    today = generate_utc_date()
    alt_reference = '' # Default to no alternative reference, i.e. use the DOI IRI

    if doi == '':
        # No DOI to look up: take the metadata (and its alternative reference) from the spreadsheet
        crossref_results, alt_reference = retrieve_spreadsheet_data()
        doi = crossref_results['doi'].upper()
        crossref_results['doi'] = doi
        data_available = True
    else:
        # Retrieve CrossRef data using DOI
        data_available = False
        crossref_results = retrieve_crossref_data(doi)
        if crossref_results == {}:
            print('No data available from CrossRef for', doi)
        else:
            print('retrieved data from CrossRef for', doi)
            # Screen for missing data that would crash the script; only the first problem is reported
            label_key = 'label_' + default_language
            if 'authors' not in crossref_results:
                problem = ' no author data'
            elif label_key not in crossref_results or crossref_results[label_key] == '':
                problem = ' title missing'
            elif len(crossref_results[label_key]) > 250:
                problem = ' label too long'
            else:
                problem = ''

            if problem == '':
                data_available = True
            else:
                print(doi + problem)
                print(doi + problem + '\n', file=log_object) # log to file

    if not data_available:
        return articles_list, stored_retrieved_authors

    # Determine the PubMed ID if there is one
    pmid = retrieve_pubmed_id(doi)
    crossref_results['pmid'] = pmid

    # Extract metadata and create a record for the article
    articles_list.append(extract_doi_metadata(crossref_results, doi, pmid, today, alt_reference))

    # Write the data to the file after every lookup in case the script crashes.
    # Field names are taken from the first dict in the list.
    write_dicts_to_csv(articles_list, file_path + 'articles.csv', list(articles_list[0].keys()))

    # Store the author data for this article.
    # It will be retrieved from the file after the basic article metadata are written and the Q ID is available
    author_row = {'doi': crossref_results['doi'], 'authors': json.dumps(crossref_results['authors'])}
    stored_retrieved_authors.append(author_row)
    write_dicts_to_csv(stored_retrieved_authors, file_path + 'stored_retrieved_authors.csv', ['doi', 'authors'])

    return articles_list, stored_retrieved_authors


# Retrieve DOI data from CrossRef to create new article items

In [11]:
# Driver cell: look up every DOI in doi_source.csv and append a record to articles.csv
# for each one that is not already in Wikidata.
#
# NOTE: This script does not (yet) check the DOI list for duplicates! They only get caught when there is an error 
# reported from the Wikidata API when VanderBot tries to upload a duplicate label/description combination.

# *** For now, hard-code logging to crossref_errors.txt
# log_object is a module-level name because create_article_record() writes to it directly.
log_path = 'crossref_errors.txt'
log_object = open(log_path, 'wt', encoding='utf-8')

# The try/finally guarantees the log is flushed and closed even if a lookup raises part-way
# through the DOI list, so problems logged before the failure are not lost.
try:
    # *** For now, hard-code data file locations since it's a Jupyter notebook
    #file_path = '/users/baskausj/github/vandycite/divinity_law/'
    # Test users should comment out the line above and uncomment the following line:
    file_path = ''

    # This file contains the DOIs to be processed. The file can contain any columns, but the one with the DOIs
    # must have the column header "doi"
    doi_source = read_dicts_from_csv(file_path + 'doi_source.csv')

    # This file contains any existing article items from previous work. 
    articles_list = read_dicts_from_csv(file_path + 'articles.csv')

    # Create a list to store author data retrieved from CrossRef. 
    # Don't need to retrieve the existing list as that will result in creating duplicate author item lines.
    #stored_retrieved_authors = read_dicts_from_csv('stored_retrieved_authors.csv')
    stored_retrieved_authors = []

    for doi_dict in doi_source:
        doi = doi_dict['doi'].strip() # remove any leading or trailing whitespace
        if doi_in_wikidata(doi):
            print(doi, 'article already in Wikidata')
            print(doi + ' article already in Wikidata\n', file=log_object) # log to file
        else:
            # The article data and retrieved author data get passed to the function, then returned with the record for
            # the DOI appended to the end
            articles_list, stored_retrieved_authors = create_article_record(doi, articles_list, stored_retrieved_authors)
finally:
    # Potentially there may need to be logging done during the author writing stage, but we'll need to be able
    # to look at the log before writing the articles anyway.
    log_object.close()
print('done')


retrieved data from CrossRef for 10.1371/JOURNAL.PBIO.3001417
done


# Interlude

After running the code above, the VanderBot script must be run on the `articles.csv` file.

Next, run the code below, and then run the VanderBot script again to add the author and author string data.

In [13]:
# Driver cell: for every article in articles.csv that has stored CrossRef author data,
# disambiguate the authors and write the author / author-string rows to CSV.
#
# NOTE: currently only DOI data is being used here. alternative references will eventually need to be supported
alt_reference =''

today = generate_utc_date()

# Start from whatever rows are already in the output files (primarily for reruns after a crash)
authors_list = read_dicts_from_csv(file_path + 'authors.csv')
author_strings_list = read_dicts_from_csv(file_path + 'author_strings.csv')

# Author data retrieved from CrossRef and stored by the article-creation step
stored_retrieved_authors = read_dicts_from_csv(file_path + 'stored_retrieved_authors.csv')

# Article items file, read after upload in order to get the Q IDs for the newly written articles
articles = read_dicts_from_csv(file_path + 'articles.csv')

for article in articles:
    qid = article['qid']
    doi = article['doi']
    print(qid, doi)
    pmid = article['pmid']

    # Find the stored author data for this DOI; articles without any are skipped
    for article_authors in stored_retrieved_authors:
        if article_authors['doi'] == doi:
            authors = json.loads(article_authors['authors'])
            break
    else:
        continue

    # Disambiguate authors against existing Wikidata people items
    found_author_qids, author_name_strings = disambiguate_authors(doi, authors, pmid)

    # The same reference URL is used for every statement about this article
    reference_url = alt_reference if alt_reference != '' else 'http://doi.org/' + doi

    # Rows for authors that were matched to a Wikidata item.
    # Key order matters: it determines the CSV column order when the list starts out empty.
    for author in found_author_qids:
        authors_list.append({
            'qid': qid,
            'label_en': article['label_en'],
            'author_uuid': '',
            'author': author['qid'],
            'author_series_ordinal': author['series_ordinal'],
            'author_stated_as': author['stated_as'],
            'author_ref1_hash': '',
            'author_ref1_referenceUrl': reference_url,
            'author_ref1_retrieved_nodeId': '',
            'author_ref1_retrieved_val': today,
            'author_ref1_retrieved_prec': ''
        })

    # The file is rewritten after each article
    if authors_list:
        write_dicts_to_csv(authors_list, file_path + 'authors.csv', list(authors_list[0].keys()))

    # Rows for authors recorded only as name strings
    for author in author_name_strings:
        author_strings_list.append({
            'qid': qid,
            'label_en': article['label_en'],
            'author_string_uuid': '',
            'author_string': author['author_string'],
            'author_string_series_ordinal': author['series_ordinal'],
            'author_string_ref1_hash': '',
            'author_string_ref1_referenceUrl': reference_url,
            'author_string_ref1_retrieved_nodeId': '',
            'author_string_ref1_retrieved_val': today,
            'author_string_ref1_retrieved_prec': ''
        })

    if author_strings_list:
        write_dicts_to_csv(author_strings_list, file_path + 'author_strings.csv', list(author_strings_list[0].keys()))

print('done')


 10.1371/JOURNAL.PBIO.3001417
retrieved data from PubMed ID 34699520
1
Searching Wikidata for Asia K. Miller
researcher known affiliations:  ['Vanderbilt University, Department of Biological Sciences, Nashville, Tennessee, United States of America.']

Wikidata search fuzzy match: 86 Asia K. Miller  /  https://www.wikidata.org/wiki/Q3600368 A. Miller
Wikidata description:  British association football player
occupations: ['association football player']
employers: []
affilliations []


Wikidata search fuzzy match: 86 Asia K. Miller  /  https://www.wikidata.org/wiki/Q89611330 A Miller
Wikidata description:  researcher
occupations: ['researcher']
employers: ['Staffordshire University']
affilliations []

Checking article, PMID: 29231092 Test-retest reliability of the irrational performance beliefs inventory.
Affiliation test:  56 a School of Life Sciences , Staffordshire University , Stoke on Trent , UK.


not found: Asia K. Miller

2
Searching Wikidata for Camille S. Westlake
researcher kn

# Test code

The code cells below were used to build and test the production code. They don't need to be run and have been retained for historical purposes.

## Retrieve DOI data for existing work records


In [None]:
# Test cell: retrieve CrossRef data for rows 50-99 of articles.csv and run the journal Q ID extraction on each
filename = 'articles.csv'
articles = read_dicts_from_csv(filename)
for article in articles[50:100]:
    # Rows without a DOI cannot be looked up
    if article['doi'] == '':
        continue
    crossref_results = retrieve_crossref_data(article['doi'])
    # An empty dict means nothing came back for this DOI
    if crossref_results == {}:
        continue
    crossref_results = extract_journal_qid(crossref_results)

# Relies on log_path and log_object already existing from an earlier cell
if log_path != '':
    log_object.close()
print('done')

# Test of invoking PMID and Wikidata test functions

In [None]:
# Test cell: look up the PubMed ID for one DOI and check whether that DOI is already in Wikidata.
# The DOI is upper-cased before both lookups.
doi = '10.1111/rec3.12158'.upper()

looked_up_pmid = retrieve_pubmed_id(doi)
print('pmid:', looked_up_pmid)

already_in_wikidata = doi_in_wikidata(doi)
print('in Wikidata?', already_in_wikidata)

# Tests of  name searches at Wikidata


In [None]:
# Test cell: search Wikidata for a single name written in a non-English script.
# The commented-out assignments are alternative test names (they appear to be the same
# person's name in Chinese and Spanish); uncomment one to try it.
#name = '尼可罗·马基亚维利'
#name = 'Nicolás Maquiavelo'
name = 'Никколо Макиавелли'
#generate_name_alternatives(name)
# Left as a bare final expression so the notebook displays the returned value
search_name_at_wikidata(name)

In [None]:
# Test cell: search Wikidata for every author that CrossRef lists for one DOI
doi = '10.1186/S13643-020-01393-8'
crossref_results = retrieve_crossref_data(doi)
for author in crossref_results['authors']:
    name = ' '.join([author['givenName'], author['familyName']])
    hit = search_name_at_wikidata(name)
    # Name, then search result, then a blank separator line
    print(name, hit, '', sep='\n')

# Tests of searches for names at Vanderbilt

In [None]:
# Test cell: load the four Vanderbilt lookup tables and show the first row of the department tables
loaded_tables = []
for filename in ('researchers.csv', 'vanderbilt_wikidata_altlabels.csv', 'departments.csv', 'department_labels.csv'):
    loaded_tables.append(read_dicts_from_csv(filename))
researchers, altnames, departments, department_labels = loaded_tables

#print(researchers[0])
#print()
#print(altnames[0])
# Blank line, first department row, blank line, first department-label row
print('', departments[0], '', department_labels[0], sep='\n')


# Test query looking for coauthor matches

The idea here was that potential author Wikidata items could be tested by seeing if they were coauthors with any of the existing authors.

I tested the query, but it never seemed to find any matches, and I didn't test or debug it thoroughly.

In [None]:
# Test cell: query for coauthors of the identified researchers, then fuzzy-match the unlinked
# researchers' names against the coauthor labels.
# Relies on found_qid_values and not_found_author_list already being defined in the kernel.
print('Looking for potential coauthor matches')
query_string = '''
select distinct ?coauthor ?label where {
  VALUES ?researcher
  {
  ''' + found_qid_values + '''}
?publication wdt:P50 ?researcher.
?publication wdt:P50 ?coauthor.
?coauthor rdfs:label ?label.
FILTER(lang(?label)='en')
FILTER(?researcher != ?coauthor)
  }
'''
#print(query_string)
results = send_sparql_query(query_string)
for author in not_found_author_list:
    author_name = author['givenName'] + ' ' + author['familyName']
    print(author_name)
    for result in results:
        w_ratio = fuzz.WRatio(author_name, result['label'])
        # Only report scores above 90
        if w_ratio <= 90:
            continue
        print('fuzzy match: ' + str(w_ratio) + ' ' + result['coauthor'] + ' ' + result['label'] + ' / ' + author_name)


# Data used in development of Q ID-screening function

The hard-coded screens below can be used with the test Q IDs to screen out "George Washington"s of different sorts. They are an alternative to loading the screens via:

```
screens = load_json_into_data_struct('screens.json')
```

In [None]:
# Test cell: run the Q ID screening function on a fixed list of test Q IDs.
qids = ['Q586680', 'Q1406257', 'Q1508517', 'Q10288976', 'Q20539851', 'Q23', 'Q79483233', 'Q103915646', 'Q5545912']

# Each screen below is a list of groups, and each group is a list of criterion dicts with these keys:
#   property      - property ID, or 'description'
#   entity        - use empty string if any value is allowed, or if filtering value strings
#   lang          - language tag, or empty string
#   position      - 'object' or 'subject'
#   require       - options: include, exclude
#   filter_type   - options: in, <, >
#   filter_string - string to filter on
# NOTE: screens1 and screens2 are defined here but are not passed to screen_qids below;
# the screens actually used are loaded from screens.json.
screens1 = [
    [
        dict(property='P31', entity='Q5', lang='', position='object',
             require='exclude', filter_type='', filter_string=''),
        dict(property='P170', entity='', lang='', position='subject',
             require='exclude', filter_type='', filter_string=''),
    ],
    [
        dict(property='description', entity='American jazz trombonist', lang='en', position='object',
             require='include', filter_type='', filter_string=''),
    ],
]

screens2 = [
    [
        dict(property='P31', entity='Q3305213', lang='', position='object', # paintings
             require='include', filter_type='', filter_string=''),
    ],
    [
        dict(property='P31', entity='Q179700', lang='', position='object', # sculptures
             require='include', filter_type='', filter_string=''),
    ],
]

screens = load_json_into_data_struct('screens.json')
return_list = screen_qids(qids, screens)
print(json.dumps(return_list, indent=2))

NCBI ID converter
```
https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=10.1016/j.ajhg.2010.04.006&format=json
```

In [None]:
# Test cell: check that a name assembled from two empty parts is detected as empty
first_name = ''
last_name = ''
combined_name = first_name + ' ' + last_name
# Stripping removes the lone separator space, leaving an empty (falsy) string
if not combined_name.strip():
    print('nothing')