In [None]:
# publoader.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.
# version 0.0.1

# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Module imports
# ----------------

import yaml
import sys
#import csv
from datetime import datetime
from time import sleep
import json
import pandas as pd
import requests
import requests_cache
from fuzzywuzzy import fuzz # fuzzy logic matching
from langdetect import detect_langs
import re # regex
import logging # See https://docs.python.org/3/howto/logging.html

# module located in same directory as script
import mapping_functions

# Set up a cache for HTTP requests: a sqlite-backed cache named 'wqs_cache'.
# POST is cached as well as GET because the Sparqler class defined in this file sends its queries by POST by default.
# NOTE(review): expire_after=300 is presumably a lifetime in seconds -- confirm against the requests_cache documentation.
requests_cache.install_cache('wqs_cache', backend='sqlite', expire_after=300, allowable_methods=['GET', 'POST'])

# ------------------------
# Utility functions
# ------------------------

def calculate_todays_date():
    """Generate the current UTC date as an xsd:date string.

    Returns
    -------
    str
        Today's date in UTC in the form YYYY-MM-DD, e.g. 2019-12-05
    """
    # datetime.utcnow() is deprecated as of Python 3.12; an aware "now" in UTC gives the same calendar date.
    from datetime import timezone
    return datetime.now(timezone.utc).date().isoformat()

def extract_local_name(iri):
    """Return everything after the last slash of an IRI (the whole string if it contains no slash).

    Example: http://www.wikidata.org/entity/Q6386232 gives Q6386232
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

def include_reference_url(url, full_works):
    """Decide whether a URL is suitable to use as a reference.

    Currently it's assumed that the criteria are the same as for a full work being available.

    Parameters
    ----------
    url : str
        Candidate URL. It is lowercased before any test is applied.
    full_works : pandas DataFrame
        Pre-screened URLs; only its 'Url' column is read.

    Returns
    -------
    str
        The lowercased URL if it is acceptable, otherwise an empty string.
    """
    # Substrings that indicate metadata and possible access
    # (not currently used: 'oxfordjournals.org/content')
    required_fragments = (
        'doi',
        'jstor',
        'article',
        'academia.edu',
        'content',
        'proquest.com/docview',
        'handle',
    )
    # Substrings that indicate a login is required
    # (not currently used: 'search.proquest.com')
    forbidden_fragments = (
        'login',
        'proxy',
        'worldcat',
        'wp-content',
        'site.ebrary.com',
        'cro3.org/',
        'worldbookonline.com/pl/infofinder',
    )
    valid_url_regex = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"

    normalized = url.lower()

    # Anything that does not look like a URL is rejected outright
    if re.match(valid_url_regex, normalized) is None:
        return ''

    # A URL that appears exactly once among the pre-screened URLs is accepted without further screening
    prescreened_hits = full_works.loc[full_works['Url']==normalized, 'Url']
    if len(prescreened_hits) == 1:
        return normalized

    # Exclusion is tested before inclusion, so a forbidden fragment always wins
    if any(fragment in normalized for fragment in forbidden_fragments):
        return ''

    if any(fragment in normalized for fragment in required_fragments):
        return normalized

    return ''

def set_description(string, work_types):
    """Look up the description that goes with a work type string.

    Parameters
    ----------
    string : str
        The type string to look for.
    work_types : iterable of dict
        Each dict is read for its 'type_string' and 'description' keys.

    Returns
    -------
    The 'description' of the first entry whose 'type_string' equals the string. An empty string is
    returned for an empty input, or (after printing and logging a warning) when nothing matches.
    """
    if string == '':
        return ''

    no_match = object() # sentinel, so that any stored description value can be returned as-is
    description = next(
        (entry['description'] for entry in work_types if string == entry['type_string']),
        no_match
    )
    if description is not no_match:
        return description

    print('Did not find datatype for type:', string)
    logging.warning('Did not find datatype for type: ' + string)
    return ''

def title_if_no_lowercase(string):
    """Return the string in title case, but only when it has no lowercase letters a-z.

    Only the 26 unaccented lowercase letters count; a string containing any of them is returned unchanged.
    """
    ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
    has_lowercase = any(character in ascii_lowercase for character in string)
    return string if has_lowercase else string.title()

def fix_all_caps(name_pieces):
    """Repair the capitalization of name pieces that were written in ALL CAPS.

    Parameters
    ----------
    name_pieces : list of str
        Name strings produced by splitting a name on spaces.

    Returns
    -------
    list of str
        One cleaned piece for each input piece, in the same order.
    """
    apostrophe_prefixes = ("van't", "'t", "O'", "D'", "d'", "N'")
    lower_case_particles = ('von', 'de', 'van', 'la', 'der')

    clean_pieces = []
    for piece in name_pieces:
        # Special handling for names starting with apostrophe-based prefixes. Only a prefix at the very
        # start of the piece is detached, so an apostrophe sequence in the middle of a name stays in place.
        apostrophe_prefix = ''
        for candidate_prefix in apostrophe_prefixes:
            if piece.startswith(candidate_prefix):
                apostrophe_prefix = candidate_prefix
                piece = piece[len(candidate_prefix):]
                break

        # Special handling for name parts that are conventionally lowercase
        if piece.lower() in lower_case_particles:
            piece = piece.lower()
        else:
            # Each hyphen-separated part is fixed on its own; this handles any number of hyphens
            # and is a plain title_if_no_lowercase() call when there is no hyphen.
            piece = '-'.join(title_if_no_lowercase(part) for part in piece.split('-'))

        # Put any apostrophe prefix back on the front
        clean_pieces.append(apostrophe_prefix + piece)
    return clean_pieces
  
def extract_name_pieces(name):
    """Extract parts of names. Recognize typical male suffixes. Fix ALL CAPS if present.

    Parameters
    ----------
    name : str
        A name or part of a name. Commas and periods are treated as spaces.

    Returns
    -------
    tuple (list of str, str)
        The name pieces (after fix_all_caps) and the formatted suffix that was removed from the end
        (', Jr.', ' II', ' III', ' IV', ' V', ' the elder'), or an empty string if there was none.
        A name with no pieces at all gives ([], '').
    """
    # Treat commas as if they were spaces; get rid of periods, which are sometimes close up with no spaces
    pieces = name.replace(',', ' ').replace('.', ' ').split(' ')
    pieces = [piece for piece in pieces if piece != '']

    # An empty or punctuation-only name has no last piece to examine
    if not pieces:
        return [], ''

    # Remove ", Jr.", "III", etc. from end of name
    trailing_suffixes = {'Jr': ', Jr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    suffix = ''
    if pieces[-1] in trailing_suffixes:
        suffix = trailing_suffixes[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2] == 'the' and pieces[-1] == 'elder':
        pieces = pieces[:-2]
        suffix = ' the elder'

    # Fix situation where name is written in ALL CAPS
    pieces = fix_all_caps(pieces)
    return pieces, suffix
    
def extract_names_from_list(names_string, settings):
    """Split a single string holding several character-separated names into structured name records.

    Parameters
    ----------
    names_string : str
        All of the names, separated by settings['names_separator'].
    settings : dict
        Read for 'names_separator' and 'name_part_separator'. A truthy 'name_part_separator' means names
        are given last name first; the other case is not implemented and produces an empty list.

    Returns
    -------
    list of dict
        One dict per name with keys 'orcid', 'givenName', 'familyName', 'suffix', and 'affiliation'.
        A name that cannot be interpreted is reported and contributes a record with empty name values.
    """
    if names_string == '':
        return []

    raw_names = names_string.split(settings['names_separator'])

    parsed_names = []
    if not settings['name_part_separator']:
        # need to write code for case where names aren't reversed
        return parsed_names

    for raw_name in raw_names:
        # Empty strings produced by the split are discarded
        parts = [part for part in raw_name.split(settings['name_part_separator']) if part != '']
        part_count = len(parts)

        surname_pieces, given_pieces, suffix = [], [], ''
        if part_count == 2: # surname and given names, no Jr.
            surname_pieces, suffix = extract_name_pieces(parts[0].strip())
            given_pieces, _ = extract_name_pieces(parts[1].strip())
        elif part_count == 3: # has Jr.
            # Note Jr. is handled inconsistently, sometimes placed after entire name, sometimes after surname
            if 'Jr' in parts[2]:
                surname_source = parts[0].strip() + ', ' + parts[2].strip()
                given_source = parts[1].strip()
            else:
                surname_source = parts[0].strip() + ', ' + parts[1].strip()
                given_source = parts[2].strip()
            surname_pieces, suffix = extract_name_pieces(surname_source)
            given_pieces, _ = extract_name_pieces(given_source)
        else:
            # A single part means the name wasn't reversed; any other count can't be interpreted either
            print('Name error:', names_string)
            logging.warning('Name error: ' + names_string)

        parsed_names.append({
            'orcid': '',
            'givenName': ' '.join(given_pieces),
            'familyName': ' '.join(surname_pieces),
            'suffix': suffix,
            'affiliation': []
        })

    return parsed_names

def extract_identifier_from_extra(extra_field, id_name):
    """Extract an identifier from the Extra field of the Zotero export. May contain the DOI of book
    sections or PMID of articles.

    Parameters
    ----------
    extra_field : str
        Text of the Extra field, containing tags of the form "NAME: value".
    id_name : str
        Name of the identifier to look for, without the trailing colon (e.g. 'DOI').

    Returns
    -------
    str
        The token following the first "id_name:" tag, or an empty string if the tag is absent or is
        the last token in the field.
    """
    tag = id_name + ':'
    # Split on any run of whitespace so that tags on separate lines or after extra spaces are still found
    tokens = extra_field.split()
    # The last token is left out of the search because nothing follows it
    for position, token in enumerate(tokens[:-1]):
        if token == tag:
            # The identifier is the next token after the tag
            return tokens[position + 1]
    return ''

def search_name_at_wikidata(name, user_agent):
    """Carry out a search for most languages that use Latin characters, plus some other commonly used 
    languages. See https://doi.org/10.1145/3233391.3233965 for reference.

    Every variant from generate_name_alternatives() is matched against Wikidata labels and aliases in
    every listed language.

    Parameters
    ----------
    name : str
        The name to search for.
    user_agent : str
        User-Agent header value passed to Sparqler.

    Returns
    -------
    list of dict
        One dict per match with keys 'qid' and 'name' (the English label). The list is empty when
        nothing matched or when the query returned no parseable result.

    Notes
    -----
    Reads the module-level variable settings (key 'sparql_sleep') for the delay after the query; it is
    not passed in as a parameter.
    """
    language_codes = [
        'en',
        'es',
        'pt',
        'fr',
        'it',
        'nl',
        'de',
        'da',
        'et',
        'hu',
        'ga',
        'ro',
        'sk',
        'sl',
        'zu',
        'tr',
        'sv',
        'zh',
        'ru',
        'ja',
        'ar',
        'pl',
        'uk',
        'ca',
        'cs',
        'la',
        'nb',
        'he',
        'eo',
        'fi',
        'ko'
      ]
    name_list = generate_name_alternatives(name)

    value_rows = []
    for language_code in language_codes:
        for alternative in name_list:
            # get rid of quotes, which will break the query
            # NOTE(review): removing apostrophes means a name like O'Brien is searched as OBrien -- confirm
            # that this is intended rather than escaping the characters.
            alternative = alternative.replace('"', '').replace("'", '')
            value_rows.append('"' + alternative + '"@' + language_code + '\n')
    alternatives = ''.join(value_rows)

    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)
    #print('searching for ', name)
    
    wdqs = Sparqler(useragent=user_agent)
    statements = wdqs.query(query)
    sleep(settings['sparql_sleep'])

    # Sparqler.query() returns None when the response can't be parsed as JSON
    if statements is None:
        return []

    results = []
    for statement in statements:
        wikidata_iri = statement['item']['value']
        if 'label' in statement:
            english_label = statement['label']['value']
        else:
            english_label = ''
        results.append({'qid': extract_local_name(wikidata_iri), 'name': english_label})
    return results

# returns lists of occupations, employers, and affiliations for a person with Wikidata ID qid
def search_wikidata_occ_emp_aff(qid, default_language, user_agent):
    """Search Wikidata for occupation, employer, and affiliation claims.

    Parameters
    ----------
    qid : str
        Wikidata Q ID of the person.
    default_language : str
        Language code that the returned labels must have.
    user_agent : str
        User-Agent header value passed to Sparqler.

    Returns
    -------
    tuple of three lists of str
        (occupation, employer, affiliation) labels, each without duplicates and in no particular order.
        All three are empty when the query returned no parseable result.

    Notes
    -----
    Reads the module-level variable settings (key 'sparql_sleep') for the delay after the query; it is
    not passed in as a parameter.
    """
    query_string = '''select distinct ?occupation ?employer ?affiliation where {
        optional {
            wd:'''+ qid + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P108 ?employerId.
            ?employerId rdfs:label ?employer.
            FILTER(lang(?employer) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P1416 ?affiliationId.
            ?affiliationId rdfs:label ?affiliation.
            FILTER(lang(?affiliation) = "'''+ default_language + '''")            
            }
        }'''
    #print(query_string)
    
    wdqs = Sparqler(useragent=user_agent)
    statements = wdqs.query(query_string)
    sleep(settings['sparql_sleep'])
    #print(statements)

    # Sparqler.query() returns None when the response can't be parsed as JSON
    if statements is None:
        return [], [], []

    # The same label can be repeated across result rows, so collect into sets to drop duplicates
    occupations = set()
    employers = set()
    affiliations = set()
    for statement in statements:
        if 'occupation' in statement:
            occupations.add(statement['occupation']['value'])
        if 'employer' in statement:
            employers.add(statement['employer']['value'])
        if 'affiliation' in statement:
            affiliations.add(statement['affiliation']['value'])
    
    return list(occupations), list(employers), list(affiliations)


def find_surname_givens(name):
    """Find the surname and given names (as a single string) from a name string. Remove typical male suffixes.

    Parameters
    ----------
    name : str
        Name in given-names-first order. Periods and commas are treated as spaces.

    Returns
    -------
    dict or bool
        {'given': ..., 'family': ...} where 'family' is the last remaining piece and 'given' is all of
        the other pieces joined by spaces. False if fewer than two usable pieces remain.
    """
    # Get rid of periods and commas, split, and drop empty pieces formed from extra spaces
    pieces = name.replace('.', ' ').replace(',', ' ').split(' ')
    pieces = [piece for piece in pieces if piece != '']

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Make sure first character is alphabetic
    # only fixes the case where there is one non-alphabetic character, but more than one is rare
    # typical cases are like (Kit) or "Kit"
    pieces = [piece if piece[0:1].isalpha() else piece[1:] for piece in pieces]
    # A piece that consisted of a single non-alphabetic character is now empty; get rid of it
    pieces = [piece for piece in pieces if piece != '']

    # Get rid of ", Jr.", "III", etc. (the first occurrence of each)
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces
    if len(pieces) < 2:
        return False

    # Put all but last piece together again
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def generate_name_alternatives(name):
    """Generate many permutations of names and initials, with and without periods, to be queried
    against Wikidata labels and aliases.

    Parameters
    ----------
    name : str
        Name in given-names-first order. Commas and periods are treated as spaces, a recognized suffix
        (Jr, II, III, IV, V, "the elder") is split off, and ALL CAPS is fixed.

    Returns
    -------
    list of str
        The distinct name variants, in no particular order. Empty if the name has no usable pieces.
    """
    # Nothing to permute for an empty or punctuation-only name
    if not name.replace(',', ' ').replace('.', ' ').split():
        return []

    # Splitting, suffix removal, and the ALL CAPS fix are the same steps that extract_name_pieces() performs
    pieces, suffix = extract_name_pieces(name)
    if not pieces: # e.g. the name consisted only of a suffix
        return []

    # Generate exactly one initial per piece so that initials[i] always belongs to pieces[i]
    initials = []
    for piece in pieces:
        # make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        trimmed = piece if piece[0:1].isalpha() else piece[1:]
        # if nothing is left after trimming, fall back on the piece's own first character
        initials.append(trimmed[0:1] if trimmed else piece[0:1])

    first_name = pieces[0]
    last_name = pieces[-1]
    first_initial = initials[0]
    middle_names = pieces[1:-1]
    middle_initials = initials[1:-1]
    leading_initials = initials[:-1] # initials of every piece except the last

    alternatives = []

    # full name
    full_name = ' '.join(pieces)
    alternatives.append(full_name)

    # full name with suffix
    if suffix != '':
        alternatives.append(full_name + suffix)

    # first and last name with initials
    alternatives.append(' '.join([first_name] + middle_initials + [last_name]))

    # first and last name with initials and periods
    alternatives.append(' '.join([first_name] + [initial + '.' for initial in middle_initials] + [last_name]))

    # first and last name only
    alternatives.append(first_name + ' ' + last_name)

    # first initial and last name only
    alternatives.append(first_initial + ' ' + last_name)

    # first initial with period and last name only
    alternatives.append(first_initial + '. ' + last_name)

    # first initial no period and all other names
    alternatives.append(' '.join([first_initial] + middle_names + [last_name]))

    # first initial with period and all other names
    alternatives.append(' '.join([first_initial + '.'] + middle_names + [last_name]))

    # all name initials with last name
    alternatives.append(' '.join([first_initial] + middle_initials + [last_name]))

    # all name initials with periods with last name
    alternatives.append(' '.join([initial + '.' for initial in leading_initials] + [last_name]))

    # all name initials concatenated with last name
    alternatives.append(''.join(leading_initials) + ' ' + last_name)

    # remove duplicates
    return list(set(alternatives))

def _screen_requirement_pattern(requirement):
    """Translate one screening requirement (a dict) into a SPARQL pattern string ending in a newline."""
    entity = requirement['entity']

    # Set the value if required or use a dummy variable if any value is allowed
    if entity is None:
        value = '?var' + requirement['property'] # add the property string to the variable to guarantee uniqueness
    elif re.fullmatch(r'Q\d+', entity): # the whole string must be a Q ID, not just start like one
        value = 'wd:' + entity
    elif requirement['lang'] is None: # if not nothing or a Q ID, assume it's a string literal
        value = '"' + entity + '"'
    else:
        value = '"' + entity + '"@' + requirement['lang']

    # Set the predicate (label, description, or P value)
    if requirement['property'] == 'label':
        predicate = 'rdfs:label'
    elif requirement['property'] == 'description':
        predicate = 'schema:description'
    else:
        predicate = 'wdt:' + requirement['property']

    # Place the value in either the subject or object position in the triple
    if requirement['position'] == 'object':
        triple_pattern = '?qid ' + predicate + ' ' + value + '.'
    else:
        triple_pattern = value + ' ' + predicate + ' ?qid.'

    # Add filters if needed
    if requirement['filter_type'] == '<' or requirement['filter_type'] == '>':
        # note: string comparison only e.g. for datetimes, needs modification for actual numbers
        triple_pattern += '\nFILTER (STR(?var' + requirement['property'] + ') ' + requirement['filter_type'] + ' "' + requirement['filter_string'] + '")'

    if requirement['filter_type'] == 'in':
        # note: string comparison only
        triple_pattern += '\nFILTER (CONTAINS(?var' + requirement['property'] + ', "' + requirement['filter_string'] + '"))'

    # Use MINUS if you want to exclude items that fit the pattern.
    if requirement['require'] == 'exclude':
        triple_pattern = 'minus {' + triple_pattern + '}'

    return triple_pattern + '\n'

def screen_qids(qids, screens, default_language, user_agent):
    """Screen Q IDs based on criteria saved in a configuration file (probably called screens.yaml).

    Parameters
    ----------
    qids : iterable of str
        Candidate Q IDs.
    screens : iterable of iterables of dict
        Each screen is a group of requirement dicts, read for the keys 'entity', 'lang', 'property',
        'position', 'filter_type', 'filter_string', and 'require'. All requirements within a screen
        must be satisfied (AND); an item passes if it satisfies any one screen (OR).
    default_language : str
        Language code required for the returned labels and descriptions.
    user_agent : str
        User-Agent header value passed to Sparqler.

    Returns
    -------
    list of dict
        One dict per passing item with keys 'qid', 'label', and 'description' (empty string if the item
        has no description in the language). Empty when the query returned no parseable result.

    Notes
    -----
    Reads the module-level variable settings (key 'sparql_sleep') for the delay after the query; it is
    not passed in as a parameter.
    """
    qid_values = ''.join('wd:' + qid + '\n' for qid in qids)

    # Each screen becomes a braced subgraph pattern; the subgraph patterns are joined with UNION
    # to create an OR relationship between screens.
    subgraph_patterns = []
    for screen in screens:
        requirement_patterns = ''.join(_screen_requirement_pattern(requirement) for requirement in screen)
        subgraph_patterns.append('{\n' + requirement_patterns + '}\n')
    graph_pattern = 'UNION\n'.join(subgraph_patterns)

    query_string = '''
    select distinct ?qid ?label ?description where {
      VALUES ?qid
      {
      ''' + qid_values + '''}
    ''' + graph_pattern + '''
    
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''
    #print(query_string)
    
    wdqs = Sparqler(useragent=user_agent)
    results = wdqs.query(query_string)
    sleep(settings['sparql_sleep'])

    # Sparqler.query() returns None when the response can't be parsed as JSON
    if results is None:
        return []

    return_list = []
    for result in results:
        out_dict = {
            'qid': extract_local_name(result['qid']['value']),
            'label': result['label']['value']
            }
        if 'description' in result:
            out_dict['description'] = result['description']['value']
        else:
            out_dict['description'] = ''
        return_list.append(out_dict)
    return return_list

def work_in_wikidata_status(label, doi, pmid, existing_works_df, settings, verbose=False):
    """Search for a work in the list of pre-existing Wikidata items by DOI, PubMed ID, and label. 
    If a fuzzy match has a high score, accept as a match. For intermediate range scores, flag as
    a case where one label is a subtitle of another.

    Parameters
    ----------
    label : str
        Label (title) of the work being tested.
    doi : str
        DOI of the work; compared in uppercase with the 'doi' column. Skipped if empty.
    pmid : str
        PubMed ID of the work; compared with the 'pmid' column. Skipped if empty.
    existing_works_df : pandas DataFrame
        Existing works; read for the columns 'doi', 'pmid', 'label', and 'qid'.
    settings : dict
        Read for 'existing_work_fuzzy_match_cutoff' and 'existing_work_subtitle_fuzzy_match_cutoff'.
    verbose : bool
        Print details of the outcome when True. The possible partial title warning is also only
        logged when True.

    Returns
    -------
    str
        One of 'found DOI', 'found PubMed ID', 'fuzzy label match', 'possible partial title', 'not found'.
    """
    if doi and doi.upper() in list(existing_works_df.loc[:, 'doi']):
        if verbose:
            print('DOI found in existing works')
        return 'found DOI'

    if pmid and pmid in list(existing_works_df.loc[:, 'pmid']):
        if verbose:
            print('PubMed ID found in existing works')
        return 'found PubMed ID'

    # NOTE: calculating the fuzz.WRatio is labor intensive, so each label is scored only once.
    # A nearly exact match anywhere in the table must win over a partial match found earlier, so the
    # first partial match is only remembered and is reported after every row has been checked.
    first_partial_match = None
    for index, work in existing_works_df.iterrows():
        w_ratio = fuzz.WRatio(work['label'], label)

        # Test for nearly exact title match
        if w_ratio > settings['existing_work_fuzzy_match_cutoff']:
            if verbose:
                print('fuzzy label match: ' + str(w_ratio))
                print('test:', label)
                print('wikidata:', work['label'])
            return 'fuzzy label match'

        # Test for meaningful subtitle match
        if first_partial_match is None and w_ratio > settings['existing_work_subtitle_fuzzy_match_cutoff']:
            first_partial_match = (w_ratio, work)

    if first_partial_match is not None:
        w_ratio, work = first_partial_match
        if verbose:
            print('Warning!!! Possible partial title: ' + str(w_ratio))
            print('test:', label)
            print('wikidata:', work['label'])
            logging.warning('Possible partial title: ' + str(w_ratio) + ' match, ' + extract_local_name(work['qid']) + ' ' + work['label'])
        return 'possible partial title'

    if verbose:
        print('Not found')
    return 'not found'

def find_containing_book(isbn, label, default_language, user_agent):
    """Search for a book by ISBN or label. Handles minimal error conditions.

    The ISBN is tried first (as ISBN-13 and ISBN-10). The label is only tried if the ISBN is empty
    or matched nothing, and a label match is only accepted if the item is an edited volume (Q1711593).

    Parameters
    ----------
    isbn : str
        ISBN to search for; an empty string skips the ISBN search.
    label : str
        Label of the book, matched exactly in default_language.
    default_language : str
        Language code of the label.
    user_agent : str
        User-Agent header value passed to Sparqler.

    Returns
    -------
    tuple (bool, str)
        (book_error, book_qid). book_qid is the Q ID of the book, or an empty string if none was
        accepted. book_error is True when the match was ambiguous or was not an edited volume.

    Notes
    -----
    Reads the module-level variable settings (key 'sparql_sleep') for the delay after each query; it is
    not passed in as a parameter.
    """
    def escape_literal(text):
        # A backslash or double quote would otherwise end or corrupt the double-quoted SPARQL string literal
        return text.replace('\\', '\\\\').replace('"', '\\"')

    book_error = False
    book_qid = ''
    found = False
    
    if isbn != '':
        query_string = '''SELECT DISTINCT ?item
    WHERE 
    {
      BIND ("''' + escape_literal(isbn) + '''" AS ?isbn)
      {?item wdt:P212 ?isbn.} # ISBN-13
      union
      {?item wdt:P957 ?isbn.} # ISBN-10
    }
    '''
        wdqs = Sparqler(useragent=user_agent)
        results = wdqs.query(query_string)
        sleep(settings['sparql_sleep'])

        # Sparqler.query() returns None when the response can't be parsed; treat that as no ISBN match
        if results is not None:
            if len(results) > 1:
                print('More than one book matches the ISBN')
                logging.warning('More than one book matches the ISBN')
                book_error = True
                found = True # ambiguous, so don't go on to search by label
            elif len(results) == 1:
                found = True
                book_qid = extract_local_name(results[0]['item']['value'])
            
    if not found:
        query_string = '''SELECT DISTINCT ?item ?type WHERE {
?item rdfs:label "''' + escape_literal(label) + '"@' + default_language + '''.
?item wdt:P31 ?type.
}'''
        wdqs = Sparqler(useragent=user_agent)
        results = wdqs.query(query_string)
        sleep(settings['sparql_sleep'])
        if (results is None) or (len(results) == 0):
            book_qid = ''
        else:
            # There is one result row for each (item, type) combination
            type_list = [extract_local_name(result['type']['value']) for result in results]
            qids_list = list(set([extract_local_name(result['item']['value']) for result in results]))
            if len(qids_list) > 1:
                book_error = True
                print('Label for published_in matches multiple items:', qids_list)
                logging.warning('Label for published_in matches multiple items: ' + str(qids_list))
                book_qid = ''
            else:
                result_qid = qids_list[0]
                if 'Q1711593' in type_list: # edited volume
                    book_qid = result_qid
                else:
                    book_error = True
                    print('Possible published_in', result_qid, 'not edited volume but has types', type_list)
                    logging.warning('Possible published_in ' + result_qid + ' not edited volume but has types ' + str(type_list))
                    book_qid = ''
    return book_error, book_qid

# ------------------------
# SPARQL query class
# ------------------------

# This is a condensed version of the more full-featured script at 
# https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikidata/sparqler.py
# It includes only the method for the query form.

class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1

    Raises
    ------
    KeyboardInterrupt
        If no useragent is given when the endpoint is the Wikidata Query Service.
        
    Required modules:
    -------------
    requests, datetime, time
    """
    def __init__(self, method='post', endpoint='https://query.wikidata.org/sparql', useragent=None, sleep=0.1):
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None:
            if self.endpoint == 'https://query.wikidata.org/sparql':
                print('You must provide a value for the useragent argument when using the Wikidata Query Service.')
                print()
                raise KeyboardInterrupt # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
        self.sleep = sleep

        # Headers sent with every request; the Accept header is added for each query
        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent
        
        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, form='select', verbose=False, **kwargs):
        """Send a SPARQL query to the endpoint.
        
        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself. 
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.
            
        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries
        containing the data (the result bindings).
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response can't be parsed as JSON,
        None is returned.
        For other forms and mediatypes, the raw response text is returned.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header. 
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation        
        """
        query_form = form
        returns_graph = query_form == 'construct' or query_form == 'describe'
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif returns_graph:
            media_type = 'text/turtle'
        else:
            media_type = 'application/sparql-results+json' # default for SELECT and ASK query forms
        self.requestheader['Accept'] = media_type
            
        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query' : query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']
        
        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        start_time = datetime.now()
        if self.http_method == 'post':
            response = requests.post(self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = requests.get(self.endpoint, params=payload, headers=self.requestheader)
        #print('from cache:', response.from_cache) # uncomment if you want to see if cached data are used
        elapsed_time = (datetime.now() - start_time).total_seconds()
        self.response = response.text # raw text of the most recent response is kept on the instance
        sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        # Only the JSON results format of SELECT and ASK is unpacked; everything else is returned raw
        if returns_graph or media_type != 'application/sparql-results+json':
            return response.text

        try:
            data = response.json()
        except ValueError:
            # The body wasn't valid JSON (e.g. an error page from the endpoint). Only the decoding error is
            # caught, so that KeyboardInterrupt and other unrelated exceptions aren't swallowed.
            return None

        if query_form == 'select':
            # Extract the values from the response JSON
            results = data['results']['bindings']
        else:
            results = data['boolean'] # True or False result from ASK query 
        return results

# ------------------------
# mapping functions
# ------------------------

def identity(value, settings):
    """Pass a value through unchanged apart from trimming surrounding whitespace.

    Note:
    -----
    The settings argument is not used here; it is accepted so that the function can be called in the same
    way as the other mapping functions."""
    trimmed_value = value.strip()
    return trimmed_value

def set_instance_of(string, settings):
    """Match the type string with possible types for the data source and return the type Q ID.

    Note:
    -----
    settings['work_types'] is a list of dictionaries, each with a 'type_string' key (the string used in the
    source data) and a 'qid' key (the Q ID to return). An empty string is returned for empty input and for
    type strings with no match. Unmatched type strings are printed and appended to the global
    error_log_string."""
    global error_log_string
    if string == '':
        return ''

    for work_type in settings['work_types']:
        if string == work_type['type_string']:
            return work_type['qid']

    # Build the message once so that the console output and the error log are identical
    # (the log entry previously lacked the space after the colon).
    message = 'Did not find datatype for type: ' + string
    print(message)
    error_log_string += message + '\n'
    return ''

def detect_language(string, settings):
    """Detect the language of the label and return the Wikidata Q ID for it.

    Note:
    -----
    settings['language_qid'] maps language codes to Q IDs and settings['language_precision_cutoff'] is the
    confidence below which a warning is issued. An empty string is returned for empty input and when the
    detected language is not in the list of known languages. When detection fails, the code 'zxx' with a
    confidence of zero is used. Warnings are printed and appended to the global error_log_string."""
    global error_log_string
    if string == '':
        return ''
    try:
        # Read the code and probability from the result object instead of slicing its string form.
        # Slicing assumed a two-letter code, so region-qualified codes such as "zh-cn" could not be
        # parsed and were silently reported as 'zxx'.
        best_guess = detect_langs(string)[0]
        lang = str(best_guess.lang)
        confidence = float(best_guess.prob)
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)

    # When the full code is unknown, fall back to its primary subtag (e.g. "zh" for "zh-cn").
    if lang not in settings['language_qid']:
        primary_subtag = lang.split('-')[0]
        if primary_subtag in settings['language_qid']:
            lang = primary_subtag

    if confidence < settings['language_precision_cutoff']:
        print('Warning: language confidence for', lang, 'below', settings['language_precision_cutoff'], ':', confidence)
        error_log_string += 'Warning: language confidence for ' + lang + ' below ' + str(settings['language_precision_cutoff']) + ': ' + str(confidence) + '\n'
    if lang in settings['language_qid']:
        return settings['language_qid'][lang]
    else:
        print('Warning: detected language', lang, 'not in list of known languages.')
        error_log_string += 'Warning: detected language ' + lang + ' not in list of known languages.\n'
        return ''

def title_en(string, settings):
    """Return the string unchanged if its detected language is English, otherwise return an empty string.

    Note:
    -----
    An empty string is also returned for empty input and when the language cannot be detected
    (e.g. when the string contains only numbers). The settings argument is not used."""
    if string == '':
        return ''
    try:
        # Only the language code of the most probable result matters here; the confidence is not used.
        lang = detect_langs(string)[0].lang
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        return ''
    if lang == 'en':
        return string
    return ''

# The following function is needed by the calculate_pages function
def roman_to_decimal(numeral):
    """Convert a Roman numeral string to an integer, ignoring letter case.

    Returns -1 if the string contains any character that is not a Roman numeral symbol. An empty string
    converts to 0.

    Note:
    -----
    Algorithm adapted from https://www.geeksforgeeks.org/python-program-for-converting-roman-numerals-to-decimal-lying-between-1-to-3999/"""
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    symbols = numeral.upper()

    # A single invalid character makes the whole numeral invalid.
    if any(symbol not in symbol_values for symbol in symbols):
        return -1

    total = 0
    position = 0
    symbol_count = len(symbols)
    while position < symbol_count:
        current = symbol_values[symbols[position]]
        # Zero stands in for "no following symbol"; every real symbol has a positive value.
        if position + 1 < symbol_count:
            following = symbol_values[symbols[position + 1]]
        else:
            following = 0

        if current < following:
            # A smaller symbol before a larger one is subtracted from it; both symbols are consumed.
            total += following - current
            position += 2
        else:
            # The current symbol is greater than or equal to the next one (or is the last one); add it.
            total += current
            position += 1

    return total

def calculate_pages(range, settings):
    """Calculate the number of pages from the page range.

    The range must consist of exactly two page numbers separated by a single hyphen. The number of pages
    is returned as a string. An empty string is returned when the input is empty, is not a two-part range,
    has a missing or unrecognized endpoint, or would give fewer than one page.

    Note
    ----
    Supports properly formatted Roman numerals and doesn't care about whitespace. The settings argument
    is not used."""
    if range == '':
        return ''
    endpoints = range.split('-')

    # A single number, or a malformed range with several hyphens, cannot be evaluated.
    if len(endpoints) != 2:
        return ''

    page_numbers = []
    for endpoint in endpoints:
        endpoint = endpoint.strip()

        # A missing endpoint (e.g. "-5" or "5-") must not be treated as the Roman numeral for zero.
        if endpoint == '':
            return ''

        # isdecimal() accepts only characters that int() can convert; isnumeric() would also accept
        # characters such as superscripts and fractions, which make int() raise an error.
        if endpoint.isdecimal():
            page_numbers.append(int(endpoint))
        else:
            # Try to interpret the endpoint as a Roman numeral.
            converted = roman_to_decimal(endpoint)

            # Will return -1 error if it contains characters not valid for Roman numerals
            if converted < 0:
                return ''
            page_numbers.append(converted)

    number_pages = page_numbers[1] - page_numbers[0] + 1 # Need to add one since first page in range counts
    if number_pages < 1:
        return ''
    return str(number_pages)

def clean_doi(value, settings):
    """Normalize a DOI by converting it to uppercase and trimming surrounding whitespace.

    Note:
    -----
    The settings argument is not used."""
    uppercased = value.upper()
    return uppercased.strip()

def extract_pmid_from_extra(extra_field, settings):
    """Extract the PubMed ID from the Extra field in the Zotero export.

    The identifier is the token that follows the first "PMID:" tag. An empty string is returned if the tag
    is absent or if nothing follows it.

    Note:
    -----
    Tokens are separated by any run of whitespace, so the tag and identifier are found whether the entries
    in the field are separated by spaces or by line breaks. The settings argument is not used."""
    tokens = extra_field.split()

    # The last token is left out of the search because a tag in that position has no identifier after it.
    for position, token in enumerate(tokens[:-1]):
        if token == 'PMID:': # match the tag for PMID
            # The identifier is the next token after the tag
            return tokens[position + 1]
    return ''

def disambiguate_published_in(value, settings):
    """Use the value in the ISSN column to try to find the containing work.

    Returns the Q ID of the containing work, or an empty string if the value is empty, the query fails,
    or no item in Wikidata has the ISSN.

    Note:
    -----
    For journal articles, this performs a legitimate WQS search for the journal title using the ISSN.
    For book chapters, the ISSN column may contain the Q ID of the containing book, inserted there during
    a pre-processing step (a hack, but typically books would not have an ISSN and this column would be empty).
    settings must provide 'script_version' and 'operator_email_address' (used to build the user agent string)
    and 'sparql_sleep' (the delay after the query). Warnings are printed and appended to the global
    error_log_string."""

    global error_log_string
    if value == '':
        return value

    # The value is a Q ID and was determined during a pre-processing step (i.e. for book chapters)
    if value.startswith('Q'):
        return value

    # Look up the ISSN in Wikidata
    # Escape backslashes and double quotes so the value cannot end or corrupt the SPARQL string literal.
    # A well-formed ISSN contains neither, so the query for it is unchanged.
    issn_literal = value.replace('\\', '\\\\').replace('"', '\\"')

    # Build query string
    query_string = '''select distinct ?container ?containerLabel where {
      ?container wdt:P236 "''' + issn_literal + '''".
      optional {
      ?container rdfs:label ?containerLabel.
      filter(lang(?containerLabel)="en")
      }
    }'''
    #print(query_string)

    user_agent = 'PubLoader/' + settings['script_version'] + ' (mailto:' + settings['operator_email_address'] + ')'
    wdqs = Sparqler(useragent=user_agent)
    query_results = wdqs.query(query_string)
    sleep(settings['sparql_sleep'])

    # The query method returns None when the response can't be parsed as JSON (e.g. an endpoint error).
    # Report it and continue without a container rather than failing on len(None).
    if query_results is None:
        print('Warning! SPARQL query for ISSN', value, 'did not return usable results')
        error_log_string += 'Warning! SPARQL query for ISSN ' + value + ' did not return usable results\n'
        return ''

    if len(query_results) == 0:
        return ''

    if len(query_results) > 1:
        print('Warning! More than one container in Wikidata matched the ISSN ')
        error_log_string += 'Warning! More than one container in Wikidata matched the ISSN\n'
        print(query_results, '\n')
        error_log_string += str(query_results) + '\n'

    # Extract Q ID from SPARQL query results. If there is more than one result, the last one is used for the Q ID
    container_iri = query_results[-1]['container']['value']
    return container_iri.split('/')[-1] # extract the local name from the IRI

def isbn10(string, settings):
    """Return the ISBN unchanged if it has 10 characters when hyphens are not counted, otherwise return
    an empty string.

    Note:
    -----
    Only the length is checked. The settings argument is not used."""
    length_without_hyphens = len(string) - string.count('-')
    return string if length_without_hyphens == 10 else ''

def isbn13(string, settings):
    """Return the ISBN unchanged if it has 13 characters when hyphens are not counted, otherwise return
    an empty string.

    Note:
    -----
    Only the length is checked. The settings argument is not used."""
    length_without_hyphens = len(string) - string.count('-')
    return string if length_without_hyphens == 13 else ''

def disambiguate_publisher(name_string, settings, publishers):
    """Find the publisher whose label best matches the name string and return its Q ID.

    The publishers table is derived from a SPARQL query (https://w.wiki/4pbi). Its index holds the Q IDs
    and its 'label' column holds the publisher names. Every label is scored against the name string with
    fuzz.WRatio and the first label with the highest score wins.

    Note:
    -----
    The best match is returned even when its score is below 98; in that case a warning is printed and
    appended to the global error_log_string. An empty string is returned if there is no publisher string."""
    global error_log_string

    # Nothing to look up if there's no publisher string
    if name_string == '':
        return ''

    top_score, top_qid, top_label = 0, '', ''
    for candidate_qid, candidate in publishers.iterrows():
        candidate_label = candidate['label']
        score = fuzz.WRatio(name_string, candidate_label)
        # Only a strictly higher score replaces the current best, so ties keep the earlier row.
        if score <= top_score:
            continue
        top_score, top_qid, top_label = score, candidate_qid, candidate_label

    if top_score < 98:
        print('w_ratio:', top_score, 'Warning: poor match of: "' + top_label + '"', top_qid, 'to stated publisher: "' + name_string + '"\n')
        error_log_string += 'w_ratio: ' + str(top_score) + ' Warning: poor match of: "' + top_label + '" ' + top_qid + ' to stated publisher: "' + name_string + '"\n'

    return top_qid

def disambiguate_place_of_publication(value, settings, publisher_locations):
    """Look up the place of publication Q ID in a table derived from the query https://w.wiki/63Ap

    The index of the publisher_locations table holds the Q IDs and its 'label' column holds the place names.
    A place matches if its label occurs anywhere in the value.
    If there is a single match, the Q ID is returned.
    If there are no matches, the string is returned (with any state names abbreviated, see below).
    If there are multiple matches, a list of dictionaries with 'qid' and 'label' keys is returned.

    Note:
    -----
    Some places are handled before the table is searched: New York, New Brunswick, and Cambridge are
    hard-coded, and "California", "Calif" and "Massachusetts" are replaced by their two-letter abbreviations.
    Failed and ambiguous lookups are appended to the global error_log_string."""
    global error_log_string
    if value == '':
        return ''

    # Cities that are assigned a fixed Q ID whenever their name occurs in the value
    for city_name, city_qid in (('New York', 'Q60'), ('New Brunswick', 'Q138338')):
        if city_name in value:
            return city_qid

    # Abbreviate state names. The order matters: "California" has to be handled before "Calif".
    for state_name, abbreviation in (('California', 'CA'), ('Calif', 'CA'), ('Massachusetts', 'MA')):
        value = value.replace(state_name, abbreviation)

    # "Cambridge, M..." gets one fixed Q ID; any other Cambridge gets another
    if 'Cambridge' in value:
        return 'Q49111' if 'Cambridge, M' in value else 'Q350'

    candidates = [
        {'qid': qid, 'label': location['label']}
        for qid, location in publisher_locations.iterrows()
        if location['label'] in value
    ]

    candidate_count = len(candidates)
    if candidate_count == 1:
        return candidates[0]['qid']

    if candidate_count == 0:
        error_log_string += value + ' not found in place list.\n'
        return value

    error_log_string += 'Multiple matches found in place list.' + str(candidates) + '\n'
    return candidates

def today(settings):
    """Return the current UTC date as an xsd:date string, e.g. 2019-12-05.

    Note:
    -----
    The settings argument is not used."""
    utc_now = datetime.utcnow()
    return utc_now.date().isoformat() # date part only, without the time

def set_reference(input_url, settings, full_works):
    """Return the URL if it is suitable to use as the reference URL value, otherwise an empty string.

    Note:
    -----
    Suitability is decided by include_reference_url, which is given the URL and full_works. The settings
    argument is not used."""
    screened_url = include_reference_url(input_url, full_works) # Screen for suitable URLs
    return screened_url if screened_url != '' else ''

def set_stated_in(input_url, settings, full_works):
    """Return the fixed stated_in Q ID when there is no URL suitable for a reference, otherwise an empty string.

    Note:
    -----
    Suitability is decided by include_reference_url, which is given the URL and full_works. The settings
    argument is not used."""
    screened_url = include_reference_url(input_url, full_works) # Screen for suitable URLs
    # Q114403967 is the Vanderbilt Divinity publications database
    return 'Q114403967' if screened_url == '' else ''

# ---------------------------
# Major processes functions
# ---------------------------

def build_function(function_name_string, passed_value_string, settings, data_structure):
    """Call the mapping function named in the configuration file and return its result.

    The function is looked up by name in the mapping_functions module and called with the passed value and
    settings. If data_structure is not None, it is passed as a third argument.

    Note:
    -----
    The value is handed to the function as-is. It was previously pasted into the source text of an
    expression and evaluated, which interpreted backslashes in the data as escape sequences and failed
    for values containing runs of quotes. An empty value is always passed as the empty string."""
    mapping_function = getattr(mapping_functions, function_name_string)

    if len(passed_value_string) == 0:
        value = ''
    else:
        value = passed_value_string

    if data_structure is not None:
        return mapping_function(value, settings, data_structure)
    return mapping_function(value, settings)

def evaluate_function(prop, work_data, settings, data_structure):
    """Evaluate the mapping function for one mapping entry using data from the source spreadsheet.

    prop is a mapping entry with a 'mapping_function' key and, when the function needs input, an
    'in_col_label' key naming the source column. work_data is the row of source data.

    Returns a tuple (no_value, output_value), where no_value is True when the output is an empty string.

    Note:
    -----
    If the entry names a source column that is not in the source data, the output is an empty string and
    the mapping function is not called. data_structure is only used when there is a source column."""
    # The mapping function may not require an argument. In that case, there's no source column.
    if 'in_col_label' in prop:
        # If the source data CSV doesn't have any column named according to mappings, the output for that
        # variable is an empty string.
        if prop['in_col_label'] in work_data:
            output_value = build_function(prop['mapping_function'], work_data[prop['in_col_label']], settings, data_structure)
        else:
            output_value = ''
    # Case where there's no argument passed to mapping function
    else:
        # Look the function up by name rather than evaluating a string-built expression.
        mapping_function = getattr(mapping_functions, prop['mapping_function'])
        output_value = mapping_function(settings)

    no_value = bool(output_value == '')
    return no_value, output_value

def extract_metadata(mapping, work_data, settings):
    """Build one row of the output table from one row of the source data.

    Steps through fields described in the config file, maps them to columns in the source data, and
    uses processing functions to transform the input data to forms required in the output table.

    Parameters
    ----------
    mapping : complex structure
        Maps column headers ("out_col_label") in the destination table to column headers ("in_col_label") in the source table.
        The "mapping_function" key indicates the function used to determine the value to be used in the destination table.
        mapping['constants'] names the source columns for the unique identifier, label, and description code.
    work_data : dict
        A row of data from the source data table with column headers as the keys.
    settings : dict
        Configuration settings. 'default_language' and 'work_types' are read here; the whole object is passed
        on to the mapping functions.

    Returns
    -------
    dict
        The output row, with output column headers as keys. Keys are added in the order: identifier, label and
        description columns, then for each output property its uuid column, value column(s), qualifier
        columns, and reference columns.

    Note
    ----
    The list of output properties is read from the global variable config (config['outfiles'][0]['prop_list']),
    not from an argument. Qualifier and reference values are evaluated with evaluate_function, the same way
    as the main value, so that values containing quotes or backslashes are handled and a missing
    source column gives an empty value. The strings in structure_name_string are evaluated with eval(), so
    the mapping configuration must come from a trusted source.
    """
    constants = mapping['constants']
    language = settings['default_language']
    out_dict = {'qid': '', 'unique_identifier': work_data[constants['unique_identifier_column']]}
    out_dict['label_' + language] = work_data[constants['label_column']]
    out_dict['description_' + language] = set_description(work_data[constants['description_code_column']], settings['work_types'])

    for out_property in config['outfiles'][0]['prop_list']:

        # Find the mapping variable that matches the config property
        # NOTE(review): if no mapping matches, the loop runs to its end and the last mapping in the list
        # is used. Confirm that every output property has a mapping.
        for prop in mapping['properties']:
            if prop['out_col_label'] == out_property['variable']:
                break

        out_field = out_property['variable']
        out_dict[out_field + '_uuid'] = ''

        # If a function requires some data structure for input, its mapping must include a structure_name_string
        # object whose value is a string that is the name of the data structure object needed by the function.
        # The data structure is passed to the mapping function as an additional argument after settings.
        # For functions not needing additional data, None is passed on as the data_structure argument.
        # NOTE: the data structure must be a global variable and therefore be defined in the main script.
        if 'structure_name_string' in prop:
            data_structure = eval(prop['structure_name_string'])
        else:
            data_structure = None

        no_value, output_value = evaluate_function(prop, work_data, settings, data_structure)

        # Populate the values-related columns
        if out_property['value_type'] == 'date':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            out_dict[out_field + '_prec'] = ''

        elif out_property['value_type'] == 'quantity':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_unit'] = ''
            else:
                out_dict[out_field + '_unit'] = prop['quantity_unit']

        # This is not actually implemented and will generate an error if used
        elif out_property['value_type'] == 'globecoordinate':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_long'] = ''
                out_dict[out_field + '_prec'] = ''
            else:
                out_dict[out_field + '_long'] = work_data[out_field + '_long']
                out_dict[out_field + '_prec'] = work_data[out_field + '_prec']

        else:
            out_dict[out_field] = output_value

        # Populate the qualifier columns
        for qualifier in out_property['qual']:
            # Qualifiers are left empty when the statement itself has no value.
            qual_output_value = ''
            if not no_value:
                # Find the mapping variable that matches the config property
                for qual in prop['qual']:
                    if qual['out_col_label'] == qualifier['variable']:
                        break

                if 'structure_name_string' in qual:
                    data_structure = eval(qual['structure_name_string'])
                else:
                    data_structure = None

                # Functions that don't need input have no in_col_label; evaluate_function handles both cases.
                no_qual_value, evaluated_value = evaluate_function(qual, work_data, settings, data_structure)
                if not no_qual_value:
                    qual_output_value = evaluated_value

            qual_field = out_field + '_' + qualifier['variable']
            # To my knowledge, dates are the only complex types used as qualifiers (no quantities or globecoordinates).
            if qualifier['value_type'] == 'date':
                out_dict[qual_field + '_nodeId'] = ''
                out_dict[qual_field + '_val'] = qual_output_value
                out_dict[qual_field + '_prec'] = ''
            else:
                out_dict[qual_field] = qual_output_value

        # Populate the reference columns
        # There's only a hash ID column if there's at least one reference.
        if len(out_property['ref']) > 0:
            out_dict[out_field + '_ref1_hash'] = ''

        for reference in out_property['ref']:
            # References are left empty when the statement itself has no value.
            ref_output_value = ''
            if not no_value:
                # Find the mapping variable that matches the config property
                for ref in prop['ref']:
                    if ref['out_col_label'] == reference['variable']:
                        break

                if 'structure_name_string' in ref:
                    data_structure = eval(ref['structure_name_string'])
                else:
                    data_structure = None

                # Some functions like today() don't need input from the source table and have no in_col_label;
                # evaluate_function handles both cases.
                no_ref_value, evaluated_value = evaluate_function(ref, work_data, settings, data_structure)
                if not no_ref_value:
                    ref_output_value = evaluated_value

            ref_field = out_field + '_ref1_' + reference['variable']
            # To my knowledge, dates are the only complex types used in references (no quantities or globecoordinates).
            if reference['value_type'] == 'date':
                out_dict[ref_field + '_nodeId'] = ''
                out_dict[ref_field + '_val'] = ref_output_value
                out_dict[ref_field + '_prec'] = ''
            else:
                out_dict[ref_field] = ref_output_value

    #print(out_dict)
    return out_dict

def disambiguate_agents(authors, pmid, coauthors, settings, user_agent):
    """Use a wide variety of data and tricks to come up with possible Wikidata matches for agent strings. 
    This includes fuzzy matching against department names and querying Wikidata labels and aliases with
    many variations of the name string.
    Returns data (Q IDs, series ordinal, stated as) that can be used for author/editor/translator statements 
    when a positive ID is made. When no positive ID is made, a list of possible matches is included for each
    author string."""

    max_pmids_to_check = 10
    # If there is a PubMed ID for the article, retrieve the author info
    if pmid != '':
        pubmed_author_info = retrieve_pubmed_data(pmid)
        print('retrieved data from PubMed ID', pmid)
        for author_index in range(len(pubmed_author_info)):
            pubmed_author_info[author_index]['name'] = pubmed_author_info[author_index]['forename'] + ' ' + pubmed_author_info[author_index]['surname']
    else:
        print('no PubMed data')

    # Augment CrossRef data with PubMed data. Typically the PubMed data is more likely to have the affiliations
    # Names are generally very similar, but vary with added or missing periods on initials and suffixes
    if pmid != '':
        for author_index in range(len(authors)):
            found = False
            crossref_name = authors[author_index]['givenName'] + ' ' + authors[author_index]['familyName']
            #print(crossref_name)
            for pubmed_author in pubmed_author_info:
                ratio = fuzz.ratio(pubmed_author['name'], crossref_name)
                #print(ratio, pubmed_author['name'])
                if ratio > 87: # had to drop down to this level because some people with missing "Jr" weren't matching
                    found = True
                    result_string = 'fuzzy label match: ' + str(ratio) + pubmed_author['name'] + ' / ' + crossref_name
                    #print(result_string)
                    break
            if not found:
                print('Did not find a match in the PubMed data for', crossref_name)
            else:
                #print(pubmed_author)
                #print(authors[author_index])

                # If there is a PubMed affiliation and no affiliation in the CrossRef data, add the PubMed affiliation
                if pubmed_author['affiliation'] != '':
                    if len(authors[author_index]['affiliation']) == 0:
                        authors[author_index]['affiliation'].append(pubmed_author['affiliation'])

                # If there is an ORCID in PubMed and no ORCID in the CrossRef data, add the ORCID to CrossRef data
                # Not sure how often this happens since I think maybe usually of one has it, the other does, too.
                if pubmed_author['orcid'] != '':
                    if authors[author_index]['orcid'] == '':
                        authors[author_index]['orcid'] = pubmed_author['orcid']

                #print(authors[author_index])

            #print()
    #print(json.dumps(pubmed_author_info, indent=2))

    # Perform screening operations on authors to try to determine their Q IDs
    found_qid_values = []
    not_found_author_list = []
    author_count = 1
    for author in authors:
        print(author_count)
        found = False
        
        # First eliminate the case where all of the name pieces are empty
        if (author['givenName'] + ' ' + author['familyName']).strip() == '':
            break
            
        # Record stated_as
        stated_as = (author['givenName'] + ' ' + author['familyName']).strip()
            
        # Fix case where names are stupidly in all caps
        name_pieces = author['givenName'].strip().split(' ')
        author['givenName'] = ' '.join(fix_all_caps(name_pieces))
        name_pieces = author['familyName'].strip().split(' ')
        author['familyName'] = ' '.join(fix_all_caps(name_pieces))
        
        # Screen for exact match to Wikidata labels
        for index, researcher in researchers.iterrows():
            if researcher['label_en'] == author['givenName'] + ' ' + author['familyName']:
                found = True
                result_string = 'researcher exact label match: ' + researcher['qid'] + ' ' + researcher['label_en']
                name = researcher['label_en']
                qid = researcher['qid']
                break
        if not found:
            # screen for exact match to alternate names
            for index, altname in altnames.iterrows():
                if altname['altLabel'] == author['givenName'] + ' ' + author['familyName']:
                    found = True
                    result_string = 'researcher altname match: ' + altname['qid'] + ' ' + altname['altLabel']
                    name = altname['altLabel']
                    qid = altname['qid']
                    break
            if not found:
                # If the researcher has an ORCID, see if it's at Wikidata
                if author['orcid'] != '':
                    hit = searchWikidataForQIdByOrcid(author['orcid'])
                    if hit != {}:
                        found = True
                        result_string = 'Wikidata ORCID search: ' + hit['qid'] + ' ' + hit['label'] + ' / ' + hit['description']
                        name = hit['label']
                        qid = hit['qid']

                if not found:
                    # screen for fuzzy match to Wikidata-derived labels
                    for index, researcher in researchers.iterrows():
                        # Require the surname to match the label surname exactly
                        split_names = find_surname_givens(researcher['label_en']) # returns False if no family name
                        if split_names: # skip names that don't have 2 parts !!! also misses non-English labels!
                            if split_names['family'] == author['familyName']: # require exact match to family name
                                w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                if w_ratio > 90:
                                    found = True
                                    result_string = 'fuzzy label match: ' + str(w_ratio) + ' ' + researcher['qid'] + ' ' + researcher['label_en'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                    name = researcher['label_en']
                                    qid = researcher['qid']
                                    break
                    if not found:
                        # screen for fuzzy match to alternate names
                        for index, altname in altnames.iterrows():
                            split_names = find_surname_givens(altname['altLabel'])
                            if split_names: # skip names that don't have 2 parts
                                if split_names['family'] == author['familyName']: # require exact match to family name
                                    w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    if w_ratio > 90:
                                        found = True
                                        result_string = 'researcher altname fuzzy match: ' + str(w_ratio) + ' ' + altname['qid'] + ' ' + altname['altLabel'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                        name = altname['altLabel']
                                        qid = altname['qid']
                                        break
                        if not found:
                            name = author['givenName'] + ' ' + author['familyName']
                            print('Searching Wikidata for', name)
                            print('researcher known affiliations: ', author['affiliation'])
                            print()
                            hits = search_name_at_wikidata(name, user_agent)
                            #print(hits)

                            qids = []
                            for hit in hits:
                                qids.append(hit['qid'])
                            return_list = screen_qids(qids, screens, settings['default_language'], user_agent) # screens is a global variable loaded at the start
                            #print(return_list)

                            for hit in return_list:
                                # Check each possible name match to the list of known co-authors/co-editors
                                # If there is a match, then use that Q ID and quit trying to match.
                                if hit['qid'] in list(coauthors.index):
                                    found = True
                                    qid = hit['qid']
                                    result_string = 'Match with known coauthor'
                                    
                            if not found:
                                # Save discovered data to return if not matched
                                discovered_data = []
                                for hit in return_list:                                
                                    hit_data = hit
                                    split_names = find_surname_givens(hit['label'])

                                    # Require the surname to match the Wikidata label surname exactly
                                    # This prevents a high fraction of fuzzy matches where the last names are similar but not the same
                                    if split_names: # skip names that don't have 2 parts
                                        if split_names['family'] == author['familyName']: # require exact match to family name
                                            #print(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print(hit)
                                            w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('w_ratio:', w_ratio)
                                            #ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('ratio:', ratio)
                                            #partial_ratio = fuzz.partial_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('partial_ratio:', partial_ratio)
                                            #token_sort_ratio = fuzz.token_sort_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('token_sort_ratio:', token_sort_ratio)
                                            #token_set_ratio = fuzz.token_set_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('token_set_ratio:', token_set_ratio)

                                            # This screen requires a high degree of similarity between the overall ORCID names and Wikidata labels
                                            if w_ratio > 80:
                                                print('Wikidata search fuzzy match:', w_ratio, author['givenName'] + ' ' + author['familyName'], ' / ', 'https://www.wikidata.org/wiki/'+ hit['qid'], hit['label'])
                                                print('Wikidata description: ', hit['description'])

                                                # Here we need to check Wikidata employer and affiliation and fuzzy match against known affiliations
                                                occupations, employers, affiliations = search_wikidata_occ_emp_aff(hit['qid'], settings['default_language'], user_agent)
                                                print('occupations:', occupations)
                                                hit_data['occupations'] = occupations
                                                print('employers:', employers)
                                                hit_data['employers'] = employers
                                                print('affiliations', affiliations)
                                                hit_data['affiliations'] = affiliations
                                                print()

                                                # Perform a check of the employer to make sure we didn't miss somebody in the earlier
                                                # string matching
                                                for employer in employers:
                                                    if 'Vanderbilt University' in employer: # catch university and med center
                                                        found = True
                                                        result_string = 'Match Vanderbilt employer in Wikidata: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                        qid = hit['qid']

                                                # If the author doesn't have any known affiliations, there is no point in checking PubMed
                                                if author['affiliation'] != []:
                                                    # Search Wikidata for articles written by this match
                                                    articles_in_wikidata = search_wikidata_article(hit['qid'])
                                                    #print(articles_in_wikidata)

                                                    # Step through articles with PubMed IDs found in Wikidata and see if the author affiliation or ORCID matches any of the articles
                                                    check = 0
                                                    for article_in_wikidata in articles_in_wikidata:
                                                        if article_in_wikidata['pmid'] != '':
                                                            check += 1
                                                            if check > max_pmids_to_check:
                                                                print('More articles, but stopping after checking', max_pmids_to_check)
                                                                break # break out of article-checking loop
                                                            print('Checking article, PMID:', article_in_wikidata['pmid'], article_in_wikidata['title'])
                                                            pubmed_match = identified_in_pubmed(article_in_wikidata['pmid'], author['givenName'] + ' ' + author['familyName'], author['affiliation'], author['orcid'])
                                                            if not pubmed_match:
                                                                #print('no match')
                                                                print()
                                                            else:
                                                                found = True
                                                                result_string = 'PubMed affilation match: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                                qid = hit['qid']
                                                                break # break out of article-checking loop

                                                if found:
                                                    break # break out of hit list loop
                                                print()
                                                # If none of the matching criteria are met, save the data for future use
                                                discovered_data.append(hit_data)

        if not found:
            not_found_author_list.append({'name_string': author['givenName'] + ' ' + author['familyName'], 'series_ordinal': author_count, 'possible_matches': discovered_data})
            print('not found:', author['givenName'] + ' ' + author['familyName'])

        else:
            found_qid_values.append({'qid': qid, 'stated_as': stated_as, 'series_ordinal': author_count})
            print(result_string)
            for index, department in departments.iterrows():
                if qid == department['qid']:
                    for lindex, department_label in department_labels.iterrows():
                        if department_label['qid'] == department['affiliation']:
                            print(department_label['label_en'])
                            break
        print()
        author_count += 1

    print()
    return found_qid_values, not_found_author_list


# Preprocessing

Idiosyncratic steps that need to be done between the Zotero output and running the "standardized" script.


In [None]:
# Preprocessing cell.
# Replaces the Url column with the screened "full work available" URL when the Zotero title matches a title in
# the screened CSV, moves the ISBN of book sections into a parent_isbn column, fills in a missing DOI from the
# Extra field, and writes the result to preprocessed.csv.

# Set up log for warnings
logging.basicConfig(filename='warnings.log', filemode='w', format='%(message)s', level=logging.WARNING)

with open('settings.yaml', 'r') as file_object:
    settings = yaml.safe_load(file_object)

# The user_agent string identifies this application to Wikimedia APIs.
# If you modify this script, you need to change the user-agent string to something else!
user_agent = 'PubLoader/' + settings['script_version'] + ' (mailto:' + settings['operator_email_address'] + ')'

# These are the pre-screened "full works available" URLs that Charlotte prepared.
full_works = pd.read_csv('full_work_div_pub.csv', na_filter=False, dtype = str)
full_works = full_works.set_index('Title')

source_data = settings['file_path'] + settings['source_data_filename']
works = pd.read_csv(source_data, na_filter=False, dtype = str)
# Only rows 1950-1999 of the source data are processed in this run.
# The copy makes the subset an independent DataFrame so the .loc assignments below modify it directly.
works = works.iloc[1950:2000].copy()

for label, work_series in works.iterrows():
    # Find the row(s) in the full_works DataFrame whose title matches this work. There should be only one.
    matching_urls = full_works.loc[full_works.index == work_series['Title'], 'Url']
    if not matching_urls.empty:
        # Use the first match by position. The Series is indexed by title strings, so a plain [0] lookup
        # would be a label lookup and is not reliable here.
        works.loc[label, 'Url'] = matching_urls.iloc[0]
    # When there is no matching title, the Zotero URL is left unchanged.

    # For book sections, the ISBN belongs to the containing book rather than to the chapter.
    if work_series['Item Type'] == 'bookSection':
        works.loc[label, 'parent_isbn'] = work_series['ISBN']
        works.loc[label, 'ISBN'] = ''
    else:
        works.loc[label, 'parent_isbn'] = ''

    # Extract the DOI for book chapters from the Extra field
    # and put it in the appropriate column.
    if work_series['DOI'] == '':
        works.loc[label, 'DOI'] = extract_identifier_from_extra(work_series['Extra'], 'DOI')

works.to_csv(settings['file_path'] + 'preprocessed.csv', index = False)
print('done')

# Main routine

NOTES: 
- Before continuing on after this step, you need to correct any of the publication locations that weren't determined.
- It would also be a good idea to run Author Disambiguator on authors before retrieving the existing works: authors who are listed only under an author name string won't get picked up, and you might end up generating duplicates.

In [None]:
# Loads the reference tables and configuration files, then queries the Wikidata Query Service for
# (1) works already in Wikidata by authors/editors affiliated with the two units named in the query, and
# (2) the other agents (coauthors/coeditors) of those works.
# Both result sets are saved as CSV files in the settings['file_path'] directory.

print('loading data from files')
publishers = pd.read_csv(settings['file_path'] + 'publishers.csv', na_filter=False, dtype = str)
publishers = publishers.set_index('qid')

publisher_locations = pd.read_csv(settings['file_path'] + 'publisher_locations.csv', na_filter=False, dtype = str)
publisher_locations = publisher_locations.set_index('qid')

researchers = pd.read_csv(settings['reference_file_path'] + 'researchers.csv', na_filter=False, dtype = str)
altnames = pd.read_csv(settings['reference_file_path'] + 'vanderbilt_wikidata_altlabels.csv', na_filter=False, dtype = str)
departments = pd.read_csv(settings['reference_file_path'] + 'departments.csv', na_filter=False, dtype = str)
department_labels = pd.read_csv(settings['reference_file_path'] + 'department_labels.csv', na_filter=False, dtype = str)

works = pd.read_csv(settings['file_path'] + 'preprocessed.csv', na_filter=False, dtype = str)

with open(settings['file_path'] + 'config.yaml', 'r') as file_object:
    config = yaml.safe_load(file_object)

with open(settings['file_path'] + 'mapping.yaml', 'r') as file_object:
    mapping = yaml.safe_load(file_object)

with open(settings['file_path'] + 'mapping_agents.yaml', 'r') as file_object:
    mapping_agents = yaml.safe_load(file_object)

# screens.yaml is a configuration file that defines the kinds of screens to be performed on potential agent Q ID matches from Wikidata
with open(settings['file_path'] + 'screens.yaml', 'r') as file_object:
    screens = yaml.safe_load(file_object)

print('retrieving existing works from Wikidata')
query_string = '''select distinct ?work ?workLabel ?doi ?pmid where {
  {?author wdt:P1416 wd:Q7914452.} # Div school
  union
  {?author wdt:P1416 wd:Q114065689.} # graduate department of religion

  {?work wdt:P50 ?author.} # author
  union
  {?work wdt:P98 ?author.} # editor

  optional {
    ?work rdfs:label ?workLabel.
    filter(lang(?workLabel)="''' + settings['default_language'] + '''")
    }

  optional {?work wdt:P356 ?doi.}
  optional {?work wdt:P698 ?pmid.}  
  }
'''

wdqs = Sparqler(useragent=user_agent)
query_results = wdqs.query(query_string)
sleep(settings['sparql_sleep'])

# Optional variables (label, DOI, PMID) are absent from a result when unbound; record them as empty strings.
found_works = []
for result in query_results:
    found_works.append({
        'qid': result['work']['value'], # kept as the full IRI, not reduced to the local name
        'label': result['workLabel']['value'] if 'workLabel' in result else '',
        'doi': result['doi']['value'].upper() if 'doi' in result else '', # valid DOIs are all upper case, but could be some bad ones
        'pmid': result['pmid']['value'] if 'pmid' in result else '',
    })
# The columns are named explicitly so that an empty result set still produces a frame with a qid column
# (otherwise the sort below raises a KeyError).
existing_works_df = pd.DataFrame(found_works, columns=['qid', 'label', 'doi', 'pmid']) # Note: qids are full IRIs
existing_works_df = existing_works_df.sort_values(by=['qid'])
existing_works_df.to_csv(settings['file_path'] + 'existing_works_in_wikidata.csv', index = False)

print('retrieving author/editor data from Wikidata')
existing_works_qids_list = list(existing_works_df.loc[:, 'qid']) # Generate a list of work IRIs from the qid column
existing_works_qids_string = '>\n<'.join(existing_works_qids_list) # Join the list into a string with one IRI per line
existing_works_qids_string = '<' + existing_works_qids_string + '>'

query_string = '''
select distinct ?agent ?label ?orcid where {
  VALUES ?value
  {
  ''' + existing_works_qids_string + '''}
  
{?value wdt:P50 ?agent.}
union
{?value wdt:P98 ?agent.}

?agent rdfs:label ?label.
FILTER(lang(?label)="''' + settings['default_language'] + '''")

optional {?agent wdt:P496 ?orcid.}

MINUS # remove Vanderbilt Div people
{
  {?agent wdt:P1416 wd:Q7914452.} # Div school
  union
  {?agent wdt:P1416 wd:Q114065689.} # graduate department of religion
}
  }
'''
#print(query_string)

wdqs = Sparqler(useragent=user_agent)
query_results = wdqs.query(query_string)
sleep(settings['sparql_sleep'])

coauthor_records = []
for result in query_results:
    coauthor_records.append({
        'qid': extract_local_name(result['agent']['value']),
        'label': result['label']['value'] if 'label' in result else '',
        'orcid': result['orcid']['value'] if 'orcid' in result else '',
    })
# As above, explicit columns keep the sort and the CSV header valid when there are no results.
coauthors = pd.DataFrame(coauthor_records, columns=['qid', 'label', 'orcid']) # NOTE: Q IDs don't include Wikidata namespace
coauthors = coauthors.sort_values(by=['qid'])
coauthors.to_csv(settings['file_path'] + 'coauthors_from_wikidata.csv', index = False)

print('done retrieving author/editor data')


Note: before running this for the first time, you need to generate `works.csv` with only the column headers and no data rows. This file can be created using `convert_yaml_to_metadata_schema.py` if the `config.yaml` file is correctly set up. A copy of the same header-only file also needs to be saved as `works_questionable_subtitles.csv`.

In [None]:
# Steps through the preprocessed works, extracts their metadata and agent (author, editor, etc.) data, and
# sorts each work into one of three outcomes:
# - not yet in Wikidata: added to works.csv (or works_questionable_subtitles.csv for a possible partial title
#   match), with its agent data added to stored_retrieved_agents.csv
# - skipped for missing data: recorded in log_skipped.tsv
# - already in Wikidata: recorded in log_found.tsv

# Load existing data for works written/to be written
works_df = pd.read_csv(settings['file_path'] + 'works.csv', na_filter=False, dtype = str)

# Load existing data for works with uncertain subtitle status
works_subtitle_df = pd.read_csv(settings['file_path'] + 'works_questionable_subtitles.csv', na_filter=False, dtype = str)

agents_list = []
# New rows are collected in lists and added to the DataFrames in one step after the loop.
new_works_rows = [] # works with status 'not found'
new_subtitle_rows = [] # works with status 'possible partial title'

# Initiate error logging. The with statement closes the log files even if processing fails part way through.
with open(settings['log_path'] + 'log_error.txt', 'at', encoding='utf-8') as error_log_object, \
        open(settings['log_path'] + 'log_skipped.tsv', 'at', encoding='utf-8') as skipped_log_object, \
        open(settings['log_path'] + 'log_found.tsv', 'at', encoding='utf-8') as found_log_object:

    for index, work_data in works.iterrows():
        print()
        print(work_data[mapping['constants']['label_column']])

        # Use the mappings to extract and process the main metadata from the source columns
        row = extract_metadata(mapping, work_data, settings)

        # Check whether the work is already in Wikidata
        # NOTE(review): this uses the fixed key 'label_en' while the rest of the cell uses
        # 'label_' + settings['default_language'] -- confirm this is intended for non-English defaults.
        wikidata_status = work_in_wikidata_status(row['label_en'], row['doi'], row['pmid'], existing_works_df, settings, verbose=True)
        if wikidata_status == 'not found' or wikidata_status == 'possible partial title':
            agents_dict = {'unique_identifier': work_data[mapping['constants']['unique_identifier_column']]}

            # For each agent type (author, editor, etc.) extract the name information
            for agent_type in mapping_agents['sources']:
                if 'structure_name_string' in agent_type:
                    # NOTE(review): eval executes text taken from mapping_agents.yaml; only use trusted mapping files.
                    data_structure = eval(agent_type['structure_name_string'])
                else:
                    data_structure = None

                source_column = agent_type['in_col_label']
                agent_structured_data = build_function(agent_type['mapping_function'], work_data[source_column], settings, data_structure)
                agents_dict[agent_type['out_col_label']] = json.dumps(agent_structured_data)

            # Get the reference values for that work
            has_values = False
            for reference_type in mapping_agents['ref']:
                if 'structure_name_string' in reference_type:
                    # NOTE(review): eval on configuration text, as above.
                    data_structure = eval(reference_type['structure_name_string'])
                else:
                    data_structure = None

                no_value, output_value = evaluate_function(reference_type, work_data, settings, data_structure)
                if no_value:
                    agents_dict[reference_type['out_col_label']] = ''
                else:
                    has_values = True
                    agents_dict[reference_type['out_col_label']] = output_value

            # Special handling for book chapters; need to find the containing book to use for published_in
            if row['instance_of'] == 'Q21481766': # Q ID for academic chapter
                book_error, book_qid = find_containing_book(work_data['parent_isbn'], row['label_' + settings['default_language']], settings['default_language'], user_agent)
                if not book_error:
                    row['published_in'] = book_qid

            # Do not add the work to the list if there is no author or editor information
            if not has_values:
                print('Warning! No agents associated with this work. Not added to output files.')
                print(row['unique_identifier'] + '\t' + 'No agents associated with this work.\t' + row['label_' + settings['default_language']].replace('"',''), file=skipped_log_object)
                continue

            # Do not add the work to the list if it's a journal article or book chapter and has no published_in value
            if (row['instance_of'] in settings['contained_types']) and (row['published_in'] == ''):
                print('Warning! Article or chapter without published_in. Not added to output files.')
                print(row['unique_identifier'] + '\t' + 'Article or chapter without published_in.\t' + row['label_' + settings['default_language']].replace('"',''), file=skipped_log_object)
                continue

            # Do not add the work unless it is one of the known work types
            # Warning already given in the processing function
            if row['instance_of'] == '':
                print('Warning! Unknown work type. Not added to output files.')
                print(row['unique_identifier'] + '\t' + 'Unknown work type.\t' + row['label_' + settings['default_language']].replace('"',''), file=skipped_log_object)
                continue

            # Queue a copy of the row for the appropriate output table
            if wikidata_status == 'not found':
                new_works_rows.append(dict(row))
            else: # status 'possible partial title'
                new_subtitle_rows.append(dict(row))

            # Append agents data to the agents list
            agents_list.append(agents_dict)

            # Log any errors that occurred
            print(row['unique_identifier'] + ' ' + row['label_' + settings['default_language']], file=error_log_object)

            # Read the warnings log
            with open('warnings.log', 'rt') as file_object:
                warnings_text = file_object.read()
            if warnings_text == '':
                print('No errors occurred.', file=error_log_object)
            else:
                print(warnings_text, file=error_log_object)
            print('', file=error_log_object)

            # Clear the warnings log
            with open('warnings.log', 'w'):
                pass

        # Log cases where work is found in Wikidata
        else:
            label = row['label_' + settings['default_language']]
            label = label.replace('"', '') # get rid of quotes that will mess up the TSV
            label = label.replace('\t', '') # get rid of tabs that will mess up the TSV
            print(row['unique_identifier'] + '\t' + wikidata_status + '\t' + label, file=found_log_object)

        print()

# Add the queued rows to the existing tables. pd.concat is used because DataFrame.append is not available
# in current pandas versions; the existing columns keep their order and any new keys become extra columns.
if new_works_rows:
    works_df = pd.concat([works_df, pd.DataFrame(new_works_rows)], ignore_index=True)
if new_subtitle_rows:
    works_subtitle_df = pd.concat([works_subtitle_df, pd.DataFrame(new_subtitle_rows)], ignore_index=True)

agents_frame = pd.DataFrame(agents_list)

works_df.to_csv(settings['file_path'] + 'works.csv', index = False)
works_subtitle_df.to_csv(settings['file_path'] + 'works_questionable_subtitles.csv', index = False)
agents_frame.to_csv(settings['file_path'] + 'stored_retrieved_agents.csv', index = False)

print('done')


# Interlude

After running the code above, the VanderBot script must be run on the `works.csv` file.

Before running this script the first time, empty agents files (`author.csv`, `editor.csv`, `translator.csv`, etc.) must be created with appropriate column headers but no data rows. These files can be created using the `convert_yaml_to_metadata_schema.py` if the `config.yaml` file is correctly set up. 

Then the following code must be run, after which the VanderBot script must be run again to add the author and author string data.

NOTE: The coauthor screen is really effective at decreasing the amount of searching that needs to be done in Wikidata. So after a first pass at running this code, one can examine the `unidentified_people.json` file to find the obvious matches (people listed as theologians, people with unusual names that match exactly, etc.), then add them to the `coauthors_from_wikidata.csv` file. Even though they aren't actually coauthors yet, they will become coauthors as soon as the agents data generated here are uploaded to Wikidata, and if this code cell is then re-run, those matched people will automatically get put correctly into the `author.csv` or `editor.csv` file. This is much simpler than trying to manually move them from `author_strings.csv` or to manually enter them in `editor.csv`.


In [None]:
# For each newly written work, disambiguates its stored agents (authors, editors, etc.) against Wikidata and
# adds rows to the per-agent-type CSV files (matched agents) and to author_strings.csv (unmatched authors).
# Unmatched people and their possible matches are saved in unidentified_people.json.

user_response = input('Write to file? (y/<cr>) ')
# Only an explicit "y" (or "yes") enables writing, so an answer such as "n" does not modify the files.
allow_writing_agents = user_response.strip().lower() in ('y', 'yes')

today_date = calculate_todays_date()

# Create a dictionary to hold a DataFrame for each agent type to append data to as it is generated,
# and load it with existing agents data.
agents_dict = {}
for agent_type in mapping_agents['sources']:
    agents_dict[agent_type['out_col_label']] = pd.read_csv(settings['file_path'] + agent_type['out_col_label'] + '.csv', na_filter=False, dtype = str)

# Load existing author strings data
author_strings_df = pd.read_csv(settings['file_path'] + 'author_strings.csv', na_filter=False, dtype = str)
unidentified = []

# Open the file containing known co-authors/co-editors
coauthors = pd.read_csv(settings['file_path'] + 'coauthors_from_wikidata.csv', na_filter=False, dtype = str)
coauthors = coauthors.set_index('qid')

# Open the file containing the stored data about authors and editors retrieved from the data source
stored_retrieved_agents = pd.read_csv(settings['file_path'] + 'stored_retrieved_agents.csv', na_filter=False, dtype = str)
stored_retrieved_agents = stored_retrieved_agents.set_index('unique_identifier')
# Build the set of identifiers once rather than converting the index to a list for every work
identifiers_with_stored_agents = set(stored_retrieved_agents.index)

# Open the works items file after upload in order to get the Q IDs for the newly written works
processed_works = pd.read_csv(settings['file_path'] + 'works.csv', na_filter=False, dtype = str)
processed_works = processed_works.set_index('unique_identifier')

# The source data file needs to be opened only to support the hack for the Vanderbilt Divinity School database
# that decides whether or not to include series ordinal
raw_works = pd.read_csv(settings['file_path'] + 'preprocessed.csv', na_filter=False, dtype = str)
raw_works = raw_works.set_index('Key')

for work_unique_identifier, work in processed_works.iterrows():
    # The processed_works DataFrame will have rows for works whose agents have previously been written.
    # In those cases, the unique identifier won't be in the retrieved_agents list, so they should be skipped.
    if work_unique_identifier not in identifiers_with_stored_agents:
        continue

    qid = work['qid']
    doi = work['doi']
    pmid = work['pmid']
    print(qid, work_unique_identifier)
    unidentified_for_work = {'qid': 'https://wikidata.org/entity/' + qid, 'unique_identifier': work_unique_identifier}

    # NOTE: in order for this lookup to work, the unique_identifier for the work must actually be unique in the
    # processed works table.
    work_agents = stored_retrieved_agents.loc[work_unique_identifier] # result is a Series if unique

    # Special hack for Vanderbilt Divinity database. Order of agents is not reliable if manually entered.
    # Order of agents is reliable if retrieved automatically from a cataloging source
    # (i.e. has a value in the "Library Catalog" field).
    # This depends only on the work, so it is looked up once here rather than once per agent type.
    manually_entered = raw_works.loc[work_unique_identifier, 'Library Catalog'] == ''

    unidentifieds_exist = False
    for agent_type in mapping_agents['sources']: # agent_type includes authors, editors, translators, etc.
        agent_type_name = agent_type['out_col_label']
        print('disamblguating', agent_type_name)

        # Disambiguate agents against existing Wikidata people items
        agents = json.loads(work_agents[agent_type_name])
        found_agent_qids, author_name_strings = disambiguate_agents(agents, pmid, coauthors, settings, user_agent)

        # Add data about unidentified people with possible Q ID matches to the list for further work.
        if len(author_name_strings) != 0:
            unidentifieds_exist = True
        unidentified_for_work[agent_type_name] = author_name_strings

        suppress_series_ordinal = False
        # Don't use series ordinal if there is only one agent in the category
        if len(agents) <= 1:
            suppress_series_ordinal = True
        if mapping_agents['constants']['suppress_series_ordinal']:
            suppress_series_ordinal = True
        if manually_entered:
            suppress_series_ordinal = True

        # Create a list of dictionaries, one per matched agent
        agent_rows = []
        for agent in found_agent_qids:
            out_dict = {}
            out_dict['qid'] = qid
            out_dict['label_' + settings['default_language']] = work['label_' + settings['default_language']]
            out_dict[agent_type_name + '_uuid'] = ''
            out_dict[agent_type_name] = agent['qid']

            out_dict[agent_type_name + '_stated_as'] = agent['stated_as']

            if suppress_series_ordinal:
                out_dict[agent_type_name + '_series_ordinal'] = ''
            else:
                out_dict[agent_type_name + '_series_ordinal'] = agent['series_ordinal']

            # Loop through all of the reference types specified in the agents mapping file
            out_dict[agent_type_name + '_ref1_hash'] = ''
            for reference_type in mapping_agents['ref']:
                out_dict[agent_type_name + '_ref1_' + reference_type['out_col_label']] = work_agents[reference_type['out_col_label']]
            out_dict[agent_type_name + '_ref1_retrieved_nodeId'] = ''
            out_dict[agent_type_name + '_ref1_retrieved_val'] = today_date
            out_dict[agent_type_name + '_ref1_retrieved_prec'] = ''

            agent_rows.append(out_dict)

        # Add the new rows to the end of the DataFrame for that particular agent type.
        # pd.concat is used because DataFrame.append is not available in current pandas versions.
        if agent_rows:
            agents_dict[agent_type_name] = pd.concat([agents_dict[agent_type_name], pd.DataFrame(agent_rows)], ignore_index=True)

        # Save after each work in case of script crash
        if allow_writing_agents:
            if len(found_agent_qids) > 0:
                agents_dict[agent_type_name].to_csv(settings['file_path'] + agent_type_name + '.csv', index = False)

        # Special treatment for authors since only authors have a "name string property"
        if agent_type_name == 'author':
            author_string_rows = []
            for author in author_name_strings:
                out_dict = {}
                out_dict['qid'] = qid
                out_dict['label_' + settings['default_language']] = work['label_' + settings['default_language']]
                out_dict['author_string_uuid'] = ''
                out_dict['author_string'] = author['name_string']

                if suppress_series_ordinal:
                    out_dict['author_string_series_ordinal'] = ''
                else:
                    out_dict['author_string_series_ordinal'] = author['series_ordinal']

                out_dict['author_string_ref1_hash'] = ''
                for reference_type in mapping_agents['ref']:
                    out_dict['author_string_ref1_' + reference_type['out_col_label']] = work_agents[reference_type['out_col_label']]
                out_dict['author_string_ref1_retrieved_nodeId'] = ''
                out_dict['author_string_ref1_retrieved_val'] = today_date
                out_dict['author_string_ref1_retrieved_prec'] = ''

                author_string_rows.append(out_dict)

            # Add the new rows to the end of the author strings DataFrame
            if author_string_rows:
                author_strings_df = pd.concat([author_strings_df, pd.DataFrame(author_string_rows)], ignore_index=True)

            #  Save after each work in case of script crash
            if allow_writing_agents:
                if len(author_name_strings) > 0:
                    author_strings_df.to_csv(settings['file_path'] + 'author_strings.csv', index = False)

    if unidentifieds_exist:
        unidentified.append(unidentified_for_work)

    # Save the potential author and editor matches in a file
    # Save after each work in case of crash; maybe later just write at end
    with open(settings['file_path'] + 'unidentified_people.json', 'wt', encoding='utf-8') as file_object:
        file_object.write(json.dumps(unidentified, indent=2))

print('done')


Code for extracting data from skipped log and joining it with fields from the original dataset

In [None]:
# Joins the skipped-works log with the original source data and writes the combined table to skipped_works.csv.

# Settings are re-loaded here rather than taken from the cells above
with open('settings.yaml', 'r') as file_object:
    settings = yaml.safe_load(file_object)

# Original source data, indexed by the Key column
source_data = settings['file_path'] + settings['source_data_filename']
works = pd.read_csv(source_data, na_filter=False, dtype = str).set_index('Key')

# Skipped-works log, also indexed by Key
skipped = pd.read_csv('log_skipped.csv', na_filter=False, dtype = str).set_index('Key')

# Keep only the keys present in both tables
skipped_merge = pd.merge(skipped, works, on=['Key'], how='inner')

# Keep the columns from 'reason' through 'ISSN' (label slices include both ends), minus two that aren't needed
out = (
    skipped_merge
    .loc[:, 'reason':'ISSN']
    .drop(columns=['title', 'Divinity Faculty'])
)
out.to_csv('skipped_works.csv', index = True)