In [None]:
# publoader.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.
# version 0.1.0

# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Module imports
# ----------------

from typing import List, Dict, Tuple, Any, Optional
import yaml
import sys
#import csv
from datetime import datetime
from time import sleep
import json
import pandas as pd
import requests
import requests_cache
from fuzzywuzzy import fuzz # fuzzy logic matching
from langdetect import detect_langs
import re # regex
import logging # See https://docs.python.org/3/howto/logging.html

# module located in same directory as script
import mapping_functions

# Set up cache for HTTP requests
# Installs a requests_cache cache named 'wqs_cache' using the sqlite backend. Both GET and POST
# requests are cacheable, and entries expire after 300 (seconds, per the requests_cache
# documentation -- TODO confirm against the installed version).
requests_cache.install_cache('wqs_cache', backend='sqlite', expire_after=300, allowable_methods=['GET', 'POST'])

# ------------------------
# Utility functions
# ------------------------

def calculate_todays_date() -> str:
    """Generate the current UTC xsd:date.

    Returns
    -------
    The current date in UTC in the form YYYY-MM-DD, e.g. 2019-12-05.

    Note
    ----
    A timezone-aware "now" is used because datetime.utcnow() is deprecated as of Python 3.12.
    """
    # Local import so that this function does not depend on the module-level import list
    from datetime import timezone

    return datetime.now(timezone.utc).date().isoformat()

def extract_local_name(iri: str) -> str:
    """Return the local name of an IRI, i.e. everything after its last slash.

    Note:
    -----
    A typical input is http://www.wikidata.org/entity/Q6386232, which yields Q6386232.
    A string containing no slash is returned unchanged.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

def include_reference_url(url: str, full_works: pd.DataFrame) -> str:
    """Decide whether a documentation URL can serve as a reference URL or full-text link.

    Returns
    -------
    The lowercased URL if it is acceptable, otherwise an empty string.
    """
    valid_url_regex = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"

    # Substrings suggesting the URL leads to metadata and possibly to accessible content
    required_substrings = (
        'doi',
        'jstor',
        #'oxfordjournals.org/content',
        'article',
        'academia.edu',
        'content',
        'proquest.com/docview',
        'handle',
    )

    # Substrings suggesting that a login is required
    forbidden_substrings = (
        'login',
        'proxy',
        #'search.proquest.com',
        'worldcat',
        'wp-content',
        'site.ebrary.com',
        'cro3.org/',
        'worldbookonline.com/pl/infofinder',
    )

    # All screening is done on the lowercase form, which is also what gets returned
    candidate = url.lower()

    # Reject anything that is not a well-formed http(s) URL
    if re.match(valid_url_regex, candidate) is None:
        return ''

    # A URL that matches exactly one pre-screened URL is accepted without further screening
    prescreened_hits = full_works.loc[full_works['Url'] == candidate, 'Url']
    if len(prescreened_hits) == 1:
        return candidate

    # Exclusions take priority over inclusions
    if any(marker in candidate for marker in forbidden_substrings):
        return ''

    if any(marker in candidate for marker in required_substrings):
        return candidate

    return ''

def set_description(string: str, work_types: List[dict]) -> str:
    """Look up the description that goes with a type string for the data source.

    Returns
    -------
    The 'description' of the first work type whose 'type_string' equals the passed-in string.
    An empty string is returned for an empty input or when no work type matches; the latter
    case is also reported on the console and in the log.
    """
    if string == '':
        return ''

    no_match = object()  # sentinel, since a description could itself be any value
    description = next(
        (work_type['description'] for work_type in work_types if string == work_type['type_string']),
        no_match
    )
    if description is not no_match:
        return description

    print('No description, did not find datatype for type:', string)
    logging.warning('No description, did not find datatype for type: ' + string)
    return ''

def title_if_no_lowercase(string: str) -> str:
    """Convert to title case only when the string has no lowercase letters.

    Note
    ----
    Only the unaccented letters a-z count as lowercase for this test.
    """
    contains_lowercase = any('a' <= character <= 'z' for character in string)
    return string if contains_lowercase else string.title()

def fix_all_caps(name_pieces: List[str]) -> List[str]:
    """Correct the capitalization for a list of name parts that are in all caps.

    Parameters
    ----------
    name_pieces
        Individual name parts (no spaces), e.g. ['JEAN-LUC', 'DE', 'SMITH'].

    Returns
    -------
    A new list of the same length with each part re-capitalized. Parts that already contain
    lowercase letters are left as they are, apart from the particles listed below, which are
    always lowercased.
    """
    # Apostrophe-based prefixes that are detached before capitalization and re-attached unchanged
    apostrophe_list = ["van't", "'t", "O'", "D'", "d'", "N'"]
    # Particles that are always written in lowercase
    lower_case_list = ['von', 'de', 'van', 'la', 'der']

    clean_pieces = []
    for piece in name_pieces:
        # Special handling for names starting with apostrophe-based prefixes
        # NOTE: the prefix is matched anywhere in the piece, not only at the start, and if more
        # than one prefix matches, only the last one found is put back.
        apostrophe_prefix = ''
        for possible_apostrophe_prefix in apostrophe_list:
            if possible_apostrophe_prefix in piece:
                piece = piece.replace(possible_apostrophe_prefix, '')
                apostrophe_prefix = possible_apostrophe_prefix

        if piece.lower() in lower_case_list:
            piece = piece.lower()
        else:
            # Capitalize each hyphen-separated part independently. A piece without a hyphen
            # splits into a single part, so this covers plain names and any number of hyphens.
            piece = '-'.join(title_if_no_lowercase(part) for part in piece.split('-'))

        # put any apostrophe prefix back on the front
        if apostrophe_prefix:
            piece = apostrophe_prefix + piece

        clean_pieces.append(piece)
    return clean_pieces
  
def extract_name_pieces(name: str) -> Tuple[List[str], str]:
    """Extract parts of names. Recognize typical male suffixes. Fix ALL CAPS if present.

    Returns
    -------
    Tuple of
    First item is the list of name parts with any trailing suffix removed and capitalization fixed.
    Second item is the suffix in the form to append to a full name (e.g. ', Jr.' or ' III'),
    or an empty string if there is no recognized suffix.

    An empty or punctuation-only name gives ([], '').
    """
    # Trailing name piece -> text to append to the full name
    suffix_lookup = {
        'Jr': ', Jr.',
        'II': ' II',
        'III': ' III',
        'IV': ' IV',
        'V': ' V'
    }

    # treat commas as if they were spaces
    # get rid of periods, sometimes periods are close up with no spaces
    name = name.replace(',', ' ').replace('.', ' ')

    # drop the empty pieces caused by consecutive spaces
    pieces = [piece for piece in name.split(' ') if piece]

    # Nothing left to analyze; indexing the last piece below would fail
    if not pieces:
        return [], ''

    # Remove ", Jr.", "III", etc. from end of name
    suffix = ''
    if pieces[-1] in suffix_lookup:
        suffix = suffix_lookup[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
        pieces = pieces[:-2]
        suffix = ' the elder'

    # Fix situation where name is written in ALL CAPS
    pieces = fix_all_caps(pieces)
    return pieces, suffix
    
def extract_identifier_from_extra(extra_field: str, id_name: str) -> str:
    """Extract a specified identifier (book DOI, article PMID) from the Zotero export Extra field.

    Parameters
    ----------
    extra_field
        Text containing entries of the form "TAG: value".
    id_name
        The tag to look for, without the colon (e.g. "PMID").

    Returns
    -------
    The token following the first occurrence of the tag, or an empty string if the tag is
    absent or is not followed by a value.
    """
    tag = id_name + ':'
    # Split on any whitespace so that entries on separate lines and repeated spaces are handled
    tokens = extra_field.split()
    # The last token is excluded because a tag there has no value after it
    for position, token in enumerate(tokens[:-1]):
        if token == tag:
            return tokens[position + 1]
    return ''

def search_name_at_wikidata(name: str, user_agent: str) -> List[Dict[str, str]]:
    """Carry out a search of labels in languages that use Latin characters, and other commonly used languages.

    Parameters
    ----------
    name
        The name to search for; variants are produced by generate_name_alternatives().
    user_agent
        User-Agent header string passed to the Sparqler object.

    Returns
    -------
    A list of dictionaries providing the Q IDs and names that match the passed-in name.
    Each dictionary has the keys 'qid' and 'name' (the English label).

    Note
    ----
    See https://doi.org/10.1145/3233391.3233965 for reference.
    Reads the delay between queries from the module-level settings['sparql_sleep'].
    """
    # Each code is listed once; a repeated code only adds redundant rows to the VALUES block
    language_codes = [
        'en', 'es', 'pt', 'fr', 'it', 'nl', 'de', 'da',
        'et', 'hu', 'ga', 'ro', 'sk', 'sl', 'zu', 'tr',
        'sv', 'zh', 'ru', 'ja', 'ar', 'pl', 'uk', 'ca',
        'cs', 'la', 'nb', 'he', 'eo', 'fi', 'ko'
      ]

    # get rid of quotes, which will break the query; a backslash would start an escape sequence
    # inside the literal and break it as well
    # NOTE(review): removing apostrophes means a label such as "O'Neil" is searched as "ONeil";
    # left as is here because changing it would change which items are returned.
    cleaned_names = [
        alternative.replace('"', '').replace("'", '').replace('\\', '')
        for alternative in generate_name_alternatives(name)
    ]

    # One language-tagged literal per combination of language and name variant
    alternatives = ''.join(
        '"' + cleaned_name + '"@' + language_code + '\n'
        for language_code in language_codes
        for cleaned_name in cleaned_names
    )
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''

    wdqs = Sparqler(useragent=user_agent)
    statements = wdqs.query(query)
    sleep(settings['sparql_sleep'])

    results = []
    for statement in statements:
        wikidata_iri = statement['item']['value']
        # Use a separate variable so that the name parameter is not overwritten
        if 'label' in statement:
            english_label = statement['label']['value']
        else:
            english_label = ''
        results.append({'qid': extract_local_name(wikidata_iri), 'name': english_label})
    return results

def search_wikidata_occ_emp_aff(qid: str, default_language: str, user_agent: str) -> Tuple[List[str], List[str], List[str]]:
    """Query Wikidata for the occupation, employer, and affiliation labels of one item.

    Returns
    -------
    Three lists of labels in the default language, with duplicates removed:
    occupations (P106), employers (P108), and affiliations (P1416).
    """
    query_string = '''select distinct ?occupation ?employer ?affiliation where {
        optional {
            wd:'''+ qid + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P108 ?employerId.
            ?employerId rdfs:label ?employer.
            FILTER(lang(?employer) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P1416 ?affiliationId.
            ?affiliationId rdfs:label ?affiliation.
            FILTER(lang(?affiliation) = "'''+ default_language + '''")            
            }
        }'''

    sparqler = Sparqler(useragent=user_agent)
    bindings = sparqler.query(query_string)
    sleep(settings['sparql_sleep'])

    # Gather the values of each query variable; any of them may be missing from a given row
    collected = {'occupation': [], 'employer': [], 'affiliation': []}
    for binding in bindings:
        for variable, values in collected.items():
            if variable in binding:
                values.append(binding[variable]['value'])

    # Deduplicate each list
    occupations, employers, affiliations = (list(set(values)) for values in collected.values())
    return occupations, employers, affiliations


def find_surname_givens(name: str) -> Dict[str, str]:
    """Extract surname and given names from a full name string and remove typical male suffixes.

    The name is expected in given-names-first order; the last remaining piece is taken as the
    family name.

    Returns
    -------
    A dictionary with the keys 'given' (given names separated by single spaces) and 'family'.
    False is returned instead when there are fewer than two usable name pieces.
    """
    suffixes = ('Jr', 'Sr', 'II', 'III', 'IV', 'V')

    # Get rid of periods and commas
    name = name.replace('.', ' ').replace(',', ' ')

    # Split name, discarding empty pieces formed from extra spaces
    pieces = [piece for piece in name.split(' ') if piece]

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Make sure first character is alphabetic
    # only fixes the case where there is one non-alphabetic character, but more than one is rare
    # typical cases are like (Kit) or "Kit"
    pieces = [piece if piece[0].isalpha() else piece[1:] for piece in pieces]
    # Now get rid of any pieces that have nothing left
    pieces = [piece for piece in pieces if piece]

    # Get rid of ", Jr.", "III", etc. Only trailing suffixes are removed, so that an initial
    # such as the "V" in "V S Naipaul" is not mistaken for a suffix.
    while pieces and pieces[-1] in suffixes:
        pieces.pop()

    # Not interested unless there are at least two pieces
    if len(pieces) < 2:
        return False

    # Put all but last piece together again
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def generate_name_alternatives(name: str) -> List[str]:
    """Generate permutations of names and initials (with and without periods) for a label and alias query.

    Returns
    -------
    A list of distinct name variants, in the order they were generated. The list is empty
    when the name contains no usable pieces.
    """
    # Nothing to generate for an empty or punctuation-only name
    if not any(name.replace(',', ' ').replace('.', ' ').split(' ')):
        return []

    # Split into pieces, detach any trailing suffix (", Jr.", " III", ...) and fix ALL CAPS
    pieces, suffix = extract_name_pieces(name)

    # Generate initials for all names. Pieces that have nothing left after removing a leading
    # non-alphabetic character are dropped so that pieces and initials stay aligned by index.
    usable_pieces = []
    initials = []
    for piece in pieces:
        # make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        stripped = piece if piece[0:1].isalpha() else piece[1:]
        if stripped:
            usable_pieces.append(piece)
            initials.append(stripped[0:1])
    pieces = usable_pieces

    # e.g. the name consisted only of a suffix
    if not pieces:
        return []

    first, last = pieces[0], pieces[-1]
    first_initial = initials[0]
    middle_names = pieces[1:-1]
    middle_initials = initials[1:-1]
    leading_initials = initials[:-1]  # initials of every piece except the last

    # full name
    full_name = ' '.join(pieces)
    alternatives = [full_name]

    # full name with suffix
    if suffix != '':
        alternatives.append(full_name + suffix)

    alternatives += [
        # first and last name with initials
        ' '.join([first] + middle_initials + [last]),
        # first and last name with initials and periods
        first + ' ' + ''.join(initial + '. ' for initial in middle_initials) + last,
        # first and last name only
        first + ' ' + last,
        # first initial and last name only
        first_initial + ' ' + last,
        # first initial with period and last name only
        first_initial + '. ' + last,
        # first initial no period and all other names
        ' '.join([first_initial] + middle_names + [last]),
        # first initial with period and all other names
        first_initial + '. ' + ' '.join(middle_names + [last]),
        # all name initials with last name
        ' '.join([first_initial] + middle_initials + [last]),
        # all name initials with periods with last name
        ''.join(initial + '. ' for initial in leading_initials) + last,
        # all name initials concatenated with last name
        ''.join(leading_initials) + ' ' + last,
    ]

    # remove duplicates, keeping first-seen order so that the result (and any query built from
    # it) is the same on every run
    return list(dict.fromkeys(alternatives))

def screen_qids(qids: List[str], screens: List[List[Dict[str, Any]]], default_language: str, user_agent: str) -> List[Dict[str, str]]:
    """Screen Q IDs based on criteria saved in the screens.yaml configuration file.

    Parameters
    ----------
    qids
        Q IDs (without namespace prefix) of the items to be screened.
    screens
        A list of screens. Each screen is a list of requirement dictionaries having the keys
        'entity', 'lang', 'property', 'position', 'filter_type', 'filter_string', and 'require'.
        All requirements within a screen must be satisfied (AND); an item passes if it
        satisfies any one of the screens (OR, implemented with UNION).
    default_language
        Language code used to select the label and description returned for each item.
    user_agent
        User-Agent header string passed to the Sparqler object.

    Returns
    -------
    A list of dictionaries providing labels and descriptions that match the queried Q IDs.
    Each dictionary has the keys 'qid', 'label', and 'description' (empty string if none).

    Note
    ----
    Reads the delay between queries from the module-level settings['sparql_sleep'].
    """
    qid_values = ''.join('wd:' + qid + '\n' for qid in qids)

    subgraph_patterns = []
    for screen in screens:
        # Each requirement in a screen has an AND relationship (all must be satisfied)
        subgraph_pattern = ''
        for requirement in screen:

            # Set the value if required or use a dummy variable if any value is allowed
            if requirement['entity'] is None:
                value = '?var' + requirement['property'] # add the property string to the variable to guarantee uniqueness
            elif re.fullmatch(r'Q\d+', requirement['entity'].strip()):
                # The whole entity must be a Q ID; a literal that merely starts like one
                # (e.g. "Q1 report") is handled as a string below.
                value = 'wd:' + requirement['entity']
            else: # if not nothing or a Q ID, assume it's a string literal
                if requirement['lang'] is None:
                    value = '"' + requirement['entity'] + '"'
                else:
                    value = '"' + requirement['entity'] + '"@' + requirement['lang']

            # Set the predicate (label, description, or P value)
            if requirement['property'] == 'label':
                predicate = 'rdfs:label'
            elif requirement['property'] == 'description':
                predicate = 'schema:description'
            else:
                predicate = 'wdt:' + requirement['property']

            # Place the value in either the subject or object position in the triple
            if requirement['position'] == 'object':
                triple_pattern = '?qid ' + predicate + ' ' + value + '.'
            else:
                triple_pattern = value + ' ' + predicate + ' ?qid.'

            # Add filters if needed. They refer to the dummy variable, so they are only
            # meaningful when no entity was specified.
            if requirement['filter_type'] in ('<', '>'):
                # note: string comparison only e.g. for datetimes, needs modification for actual numbers
                triple_pattern += '\nFILTER (STR(?var' + requirement['property'] + ') ' + requirement['filter_type'] + ' "' + requirement['filter_string'] + '")'

            if requirement['filter_type'] == 'in':
                # note: string comparison only
                triple_pattern += '\nFILTER (CONTAINS(?var' + requirement['property'] + ', "' + requirement['filter_string'] + '"))'

            # Use MINUS if you want to exclude items that fit the pattern.
            if requirement['require'] == 'exclude':
                triple_pattern = 'minus {' + triple_pattern + '}'

            subgraph_pattern += triple_pattern + '\n'

        # Wrap the screen in braces so that several screens can be UNIONed
        subgraph_patterns.append('{\n' + subgraph_pattern + '}\n')

    # Join the screens with UNION to create an OR relationship
    graph_pattern = 'UNION\n'.join(subgraph_patterns)

    query_string = '''
    select distinct ?qid ?label ?description where {
      VALUES ?qid
      {
      ''' + qid_values + '''}
    ''' + graph_pattern + '''
    
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''

    wdqs = Sparqler(useragent=user_agent)
    results = wdqs.query(query_string)
    sleep(settings['sparql_sleep'])

    return_list = []
    for result in results:
        return_list.append({
            'qid': extract_local_name(result['qid']['value']),
            'label': result['label']['value'],
            # The description is optional in the query
            'description': result['description']['value'] if 'description' in result else ''
        })
    return return_list

def work_in_wikidata_status(label: str, doi: str, pmid: str, existing_works_df: pd.DataFrame, settings: Dict[str, Any], verbose: bool = False) -> Tuple[str, str]:
    """Search by DOI, PubMed ID, and label for a work in the list of pre-existing Wikidata items.

    Parameters
    ----------
    label
        Title of the work being checked.
    doi, pmid
        Identifiers of the work; an empty string means the identifier is not known.
    existing_works_df
        Existing works, with the columns 'qid', 'doi', 'pmid', and 'label'. The DOI is converted
        to upper case before it is compared with the 'doi' column.
    settings
        Must contain 'existing_work_fuzzy_match_cutoff' and
        'existing_work_subtitle_fuzzy_match_cutoff' when labels have to be compared.
    verbose
        If True, print details about the match.

    Returns
    -------
    Tuple of
    First item is the status: 'found DOI', 'found PubMed ID', 'fuzzy label match',
    'possible partial title', or 'not found'.
    Second item is the 'qid' value of the matching work, or an empty string if not found.

    Notes
    -----
    If a fuzzy match has a high score, accept as a match.
    For intermediate range scores, flag as a case where one label is a subtitle of another.
    """
    if doi:
        # Use the upper case DOI for the lookup as well as the test, so that a DOI supplied in
        # lower case retrieves the row it matched
        doi_matches = existing_works_df.loc[existing_works_df['doi'] == doi.upper(), 'qid']
        if len(doi_matches) > 0:
            if verbose:
                print('DOI found in existing works')
            return 'found DOI', doi_matches.iloc[0]

    if pmid:
        pmid_matches = existing_works_df.loc[existing_works_df['pmid'] == pmid, 'qid']
        if len(pmid_matches) > 0:
            if verbose:
                print('PubMed ID found in existing works')
            return 'found PubMed ID', pmid_matches.iloc[0]

    # NOTE: the check for a nearly exact match must be completed for all works before partial
    # matches are considered, since we don't want the search for a nearly exact match to be
    # stopped if a partial match is found first. Calculating fuzz.WRatio is labor intensive,
    # so the scores are saved in the first pass and reused in the second.
    scored_works = []
    for index, work in existing_works_df.iterrows():
        w_ratio = fuzz.WRatio(work['label'], label)

        # Test for nearly exact title match
        if w_ratio > settings['existing_work_fuzzy_match_cutoff']:
            if verbose:
                print('fuzzy label match: ' + str(w_ratio))
                print('test:', label)
                print('wikidata:', extract_local_name(work['qid']), work['label'])
            return 'fuzzy label match', work['qid']
        scored_works.append((work['qid'], work['label'], w_ratio))

    for work_qid, work_label, w_ratio in scored_works:
        # Test for meaningful subtitle match
        if w_ratio > settings['existing_work_subtitle_fuzzy_match_cutoff']:
            # NOTE: sometimes a work will have an acceptable score, because a long title matches a single
            # word title or vice versa. Those cases are nearly always bad matches and should not be
            # considered possible subtitle matches.
            if len(work_label.split(' ')) != 1 and len(label.split(' ')) != 1:
                if verbose:
                    print('Warning!!! Possible partial title: ' + str(w_ratio))
                    print('test:', label)
                    print('wikidata:', extract_local_name(work_qid), work_label)
                    logging.warning('Possible partial title: ' + str(w_ratio) + ' match, ' + extract_local_name(work_qid) + ' ' + work_label)
                return 'possible partial title', work_qid
            # otherwise keep looking for matches among existing works
            print('skipped over possible partial title: ' + str(w_ratio) + ' match, ' + extract_local_name(work_qid) + ' ' + work_label)

    if verbose:
        print('Not found')
    return 'not found', ''

def find_containing_book(isbn: str, label: str, default_language: str, user_agent: str) -> Tuple[bool, str]:
    """SPARQL query for a book by ISBN or label.

    The ISBN (if any) is tried first. The label is searched only when the ISBN search
    finds nothing.

    Returns
    -------
    Tuple of
    First item a boolean, True if an error condition such as more than one book match or non-book type.
    Second item is the Q ID of the matching book or empty string if no match (not an error condition).

    Note
    ----
    Reads the delay between queries from the module-level settings['sparql_sleep'].
    """
    book_error = False
    found = False
    book_qid = ''

    if isbn != '':
        query_string = '''SELECT DISTINCT ?item
    WHERE 
    {
      BIND ("''' + isbn + '''" AS ?isbn)
      {?item wdt:P212 ?isbn.} # ISBN-13
      union
      {?item wdt:P957 ?isbn.} # ISBN-10
    }
    '''
        wdqs = Sparqler(useragent=user_agent)
        results = wdqs.query(query_string)
        sleep(settings['sparql_sleep'])

        # Treat a None result like an empty one, as the label search below does
        match_count = 0 if results is None else len(results)
        if match_count > 1:
            print('More than one book matches the ISBN')
            logging.warning('More than one book matches the ISBN')
            book_error = True
            found = True
            book_qid = ''
        elif match_count == 1:
            found = True
            book_qid = extract_local_name(results[0]['item']['value'])

    if not found:
        # Escape backslashes and double quotes so that a title containing them still forms a
        # valid SPARQL string literal
        escaped_label = label.replace('\\', '\\\\').replace('"', '\\"')
        query_string = '''SELECT DISTINCT ?item ?type WHERE {
?item rdfs:label "''' + escaped_label + '"@' + default_language + '''.
?item wdt:P31 ?type.
}'''
        wdqs = Sparqler(useragent=user_agent)
        results = wdqs.query(query_string)
        sleep(settings['sparql_sleep'])
        if (results is None) or (len(results) == 0):
            book_qid = ''
        else:
            # One row is returned for each type of each item, so the items must be deduplicated
            type_list = [extract_local_name(result['type']['value']) for result in results]
            qids_list = list(set([extract_local_name(result['item']['value']) for result in results]))
            if len(qids_list) > 1:
                book_error = True
                print('Label for published_in matches multiple items:', qids_list)
                logging.warning('Label for published_in matches multiple items: ' + str(qids_list))
                book_qid = ''
            else:
                result_qid = qids_list[0]
                if 'Q1711593' in type_list: # edited volume
                    book_qid = result_qid
                else:
                    book_error = True
                    print('Possible published_in', result_qid, 'not edited volume but has types', type_list)
                    logging.warning('Possible published_in ' + result_qid + ' not edited volume but has types ' + str(type_list))
                    book_qid = ''
    return book_error, book_qid

def create_skipped_dict(wikidata_status: str, work_data: pd.Series, mapping: Dict[str, Any]) -> Dict[str, str]:
    """Build the output row for a work whose processing was skipped.

    The row records the work's unique identifier, the reason it was skipped (the passed-in
    status), and descriptive fields copied from the work's data.
    """
    skipped = {
        'key': work_data[mapping['constants']['unique_identifier_column']],
        'reason': wikidata_status,
        'item_type': work_data[mapping['constants']['description_code_column']],
    }

    # !!!! Idiosyncratic to Zotero dump: for a book section the ISBN comes from the parent_isbn column
    isbn_column = 'parent_isbn' if skipped['item_type'] == 'bookSection' else 'ISBN'

    # Remaining fields as (output key, source column) pairs, in output order
    remaining_columns = (
        ('isbn', isbn_column),
        ('issn', 'ISSN'),
        ('publication_title', 'Publication Title'),
        ('publication_year', 'Publication Year'),
        ('author', 'Author'),
        ('title', 'Title'),
    )
    for output_key, source_column in remaining_columns:
        skipped[output_key] = work_data[source_column]
    return skipped

# ------------------------
# SPARQL query class
# ------------------------

# This is a condensed version of the more full-featured script at 
# https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikidata/sparqler.py
# It includes only the method for the query form.

class Sparqler:
    """Send SPARQL query-form requests (SELECT, ASK, CONSTRUCT, DESCRIBE) to an endpoint."""
    
    def __init__(self, method: str = 'post', endpoint: str = 'https://query.wikidata.org/sparql', useragent: Optional[str] = None, sleep: float = 0.1):
        """Build SPARQL queries of various sorts

        Parameters
        -----------
        useragent: str
            Required if using the Wikidata Query Service, otherwise optional.
            Use the form: appname/v.v (URL; mailto:email@domain.com)
            See https://meta.wikimedia.org/wiki/User-Agent_policy
        endpoint: URL
            Defaults to Wikidata Query Service if not provided.
        method: str
            Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
            Must be "post" for update endpoint.
        sleep: float
            Number of seconds to wait between queries. Defaults to 0.1

        Raises
        ------
        KeyboardInterrupt
            If no useragent is given while the endpoint is the Wikidata Query Service.

        Required modules
        ----------------
        requests, datetime, time
        """
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None:
            if self.endpoint == 'https://query.wikidata.org/sparql':
                print('You must provide a value for the useragent argument when using the Wikidata Query Service.')
                print()
                raise KeyboardInterrupt # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
        self.sleep = sleep

        # Headers sent with every request; the Accept header is added per query.
        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent
        
        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string: str, form: str = 'select', verbose: bool = False, **kwargs):
        """Send a SPARQL query to the endpoint.
        
        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself. 
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.
            
        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries
        containing the data (the "bindings" of the response).
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response body is not valid JSON or
        does not have the expected structure, None is returned.
        For other forms and mediatypes, the raw response text is returned.

        Notes
        -----
        The raw text of the most recent response is also stored in self.response.
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header. 
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation        
        """
        query_form = form
        returns_graph = query_form == 'construct' or query_form == 'describe'
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif returns_graph:
            media_type = 'text/turtle'
        else:
            media_type = 'application/sparql-results+json' # default for SELECT and ASK query forms
        self.requestheader['Accept'] = media_type
            
        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query' : query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']
        
        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        start_time = datetime.now()
        if self.http_method == 'post':
            response = requests.post(self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = requests.get(self.endpoint, params=payload, headers=self.requestheader)
        #print('from cache:', response.from_cache) # uncomment if you want to see if cached data are used
        elapsed_time = (datetime.now() - start_time).total_seconds()
        self.response = response.text
        sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        # Graph-producing forms and non-JSON media types are handed back unparsed.
        if returns_graph or media_type != 'application/sparql-results+json':
            return response.text

        # Catch only decoding failures (the decode errors raised by response.json() are ValueError
        # subclasses). A bare except would also swallow KeyboardInterrupt and SystemExit, making a
        # long-running loop of queries impossible to interrupt.
        try:
            data = response.json()
        except ValueError:
            return None # Returns no value if an error. 

        # A JSON body without the expected keys (e.g. an error document) is also reported as None.
        try:
            if query_form == 'select':
                # Extract the values from the response JSON
                return data['results']['bindings']
            return data['boolean'] # True or False result from ASK query
        except (KeyError, TypeError):
            return None

# ---------------------------
# Major processes functions
# ---------------------------

def evaluate_function(work_data: pd.Series, column_map: Dict[str, str], settings: Dict[str, Any]) -> Any:
    """Assemble the positional arguments described by a column mapping and call its mapping function.

    Parameters
    ----------
    work_data: pd.Series
        A row of data from the source data table with column headers as the keys.
    column_map: complex structure
        One mapping entry. "mapping_function" names the function to call (looked up in the global
        MODULE_FUNCTIONS). Optional keys add arguments, in this order: "in_col_label" (value of that
        source column), "use_settings" (the settings, when truthy), "reference_dfs" (each name is
        upper-cased and looked up in the global VARIABLES).
    settings: complex structure
        Configuration data

    Returns
    -------
    Whatever the mapping function returns.
    """
    positional = []

    # The source column value, when present, is always the first argument.
    if 'in_col_label' in column_map:
        positional.append(work_data[column_map['in_col_label']])

    # If use_settings key is present, it must have a True value to pass the settings
    if column_map.get('use_settings'):
        positional.append(settings)

    # Add any necessary globally defined reference DataFrames to the argument list
    if 'reference_dfs' in column_map:
        positional.extend(VARIABLES[df_name.upper()] for df_name in column_map['reference_dfs'])

    mapping_function = MODULE_FUNCTIONS[column_map['mapping_function']]
    return mapping_function(*positional)

def _find_mapping_entry(entries: List[Dict[str, Any]], variable: str, kind: str) -> Dict[str, Any]:
    """Return the mapping entry whose "out_col_label" equals variable; raise KeyError if none does."""
    for entry in entries:
        if entry['out_col_label'] == variable:
            return entry
    raise KeyError('No ' + kind + ' mapping found with out_col_label "' + str(variable) + '"')

def extract_metadata(mapping: Dict[str, Any], work_data: pd.Series, settings: Dict[str, Any]) -> Dict[str, str]:
    """Step through output fields, map them to source data columns, and transform input data for output row.
    
    Parameters
    ----------
    mapping: complex structure
        Maps column headers ("out_col_label") in the destination table to column headers ("in_col_label") 
        in the source table.
        The "mapping_function" key indicates the function used to determine the value to be used in the 
        output row of the destination table.
    work_data: pd.Series
        A row of data from the source data table with column headers as the keys.
    settings: complex structure
        Configuration data

    Returns
    -------
    Dictionary whose keys are the output column headers and whose values are the output row values.
    Keys are inserted in the order of the properties in the output configuration.

    Raises
    ------
    KeyError
        If a property, qualifier, or reference listed in the output configuration has no mapping entry
        with a matching "out_col_label". (Previously the last or a stale mapping entry was silently
        used in this case, producing values from the wrong mapping.)

    Notes
    -----
    The list of output properties is read from the global variable config (first item of its
    "outfiles" list), which is not passed in as an argument.
    """
    constants = mapping['constants']
    language = settings['default_language']

    out_dict = {'qid': '', 'unique_identifier': work_data[constants['unique_identifier_column']]}
    out_dict['label_' + language] = work_data[constants['label_column']]
    out_dict['description_' + language] = set_description(work_data[constants['description_code_column']], settings['work_types'])

    for out_property in config['outfiles'][0]['prop_list']:
        out_field = out_property['variable']

        # Find the mapping variable that matches the config property
        prop = _find_mapping_entry(mapping['properties'], out_field, 'property')

        out_dict[out_field + '_uuid'] = ''
        
        # If a function requires some data structure for input, its mapping must include the string that is 
        # the name of the data structure object needed by the function as an item in the reference_dfs list.
        
        # NOTE(review): data_structure is not used anywhere below, and eval() executes whatever expression
        # the mapping supplies. It is kept unchanged to preserve existing behavior (including the error
        # raised for an undefined name); consider removing it or replacing it with a dictionary lookup.
        if 'structure_name_string' in prop:
            data_structure = eval(prop['structure_name_string'])
        else:
            data_structure = None

        output_value = evaluate_function(work_data, prop, settings)
        # An empty string means there is no statement; qualifiers and references are then left empty too.
        no_value = bool(output_value == '')

        # Populate the values-related columns
        if out_property['value_type'] == 'date':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            out_dict[out_field + '_prec'] = ''

        elif out_property['value_type'] == 'quantity':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_unit'] = ''
            else:
                out_dict[out_field + '_unit'] = prop['quantity_unit']

        # This is not actually implemented and will generate an error if used
        elif out_property['value_type'] == 'globecoordinate':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_long'] = ''
                out_dict[out_field + '_prec'] = ''
            else:
                out_dict[out_field + '_long'] = work_data[out_field + '_long']
                out_dict[out_field + '_prec'] = work_data[out_field + '_prec']

        else:
            out_dict[out_field] = output_value

        # Populate the qualifier columns
        for qualifier in out_property['qual']:
            if no_value:
                qual_output_value = ''
            else:
                # Find the mapping variable that matches the config property
                qual = _find_mapping_entry(prop['qual'], qualifier['variable'], 'qualifier (' + out_field + ')')
                qual_output_value = evaluate_function(work_data, qual, settings)

            qual_field = out_field + '_' + qualifier['variable']
            # To my knowledge, dates are the only complex types used as qualifiers (no quantities or globecoordinates).
            if qualifier['value_type'] == 'date':
                out_dict[qual_field + '_nodeId'] = ''
                out_dict[qual_field + '_val'] = qual_output_value
                out_dict[qual_field + '_prec'] = ''
            else:
                out_dict[qual_field] = qual_output_value
                
        # Populate the reference columns
        # There's only a hash ID column if there's at least one reference.
        if len(out_property['ref']) > 0:
            out_dict[out_field + '_ref1_hash'] = ''
            
        for reference in out_property['ref']:
            if no_value:
                ref_output_value = ''
            else:
                # Find the mapping variable that matches the config property
                ref = _find_mapping_entry(prop['ref'], reference['variable'], 'reference (' + out_field + ')')
                ref_output_value = evaluate_function(work_data, ref, settings)

            ref_field = out_field + '_ref1_' + reference['variable']
            # Dates are the only complex type handled for references (no quantities or globecoordinates).
            if reference['value_type'] == 'date':
                out_dict[ref_field + '_nodeId'] = ''
                out_dict[ref_field + '_val'] = ref_output_value
                out_dict[ref_field + '_prec'] = ''
            else:
                out_dict[ref_field] = ref_output_value
                    
    return out_dict

def disambiguate_agents(authors: List[Dict[str, str]], pmid: str, coauthors: pd.DataFrame, settings: Dict[str, Any], user_agent: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Find possible Wikidata Q ID matches from agent strings.

    Parameters
    ----------
    authors: list of dict
        One dictionary per agent. The keys read here are "givenName", "familyName", "affiliation"
        (a list, which may be appended to) and "orcid". The dictionaries are modified in place:
        name pieces are passed through fix_all_caps, and missing affiliation/ORCID values may be
        filled in from PubMed.
    pmid: str
        PubMed ID of the work, or an empty string if there is none.
    coauthors: pd.DataFrame
        Known co-authors/co-editors; only its index is used, and it is compared to candidate Q IDs.
    settings: complex structure
        Configuration data. Only settings['default_language'] is read here.
    user_agent: str
        Passed through to the Wikidata search and screening helper functions.

    Returns
    -------
    A tuple (found_qid_values, not_found_author_list).
    When a positive ID is made, returns Q IDs, series ordinal, stated_as to use for author/editor/translator
    statements (dictionaries with keys "qid", "stated_as", "series_ordinal").
    When no positive ID is made, a list of possible matches is also included for each author string
    (dictionaries with keys "name_string", "series_ordinal", "possible_matches").
    
    Notes
    -----
    Use a wide variety of data and tricks to come up with possible matches.
    This includes fuzzy matching against department names and querying Wikidata labels and aliases with
    many variations of the name string.

    The screens are tried in this order, stopping at the first that succeeds: exact label match,
    exact alias match, ORCID lookup at Wikidata, fuzzy label match, fuzzy alias match, and finally a
    Wikidata name search whose hits are checked against known co-authors, Wikidata employers, and
    PubMed article data.

    Relies on the global variables RESEARCHERS, ALTNAMES, DEPARTMENTS, DEPARTMENT_LABELS, and screens,
    and on helper functions defined elsewhere in the script.
    """
    # Upper limit on the number of articles with PubMed IDs checked per candidate Wikidata item
    max_pmids_to_check = 10
    # If there is a PubMed ID for the article, retrieve the author info
    if pmid != '':
        pubmed_author_info = retrieve_pubmed_data(pmid)
        print('retrieved data from PubMed ID', pmid)
        # Build a full name string for each PubMed author to compare with the CrossRef names
        for author_index in range(len(pubmed_author_info)):
            pubmed_author_info[author_index]['name'] = pubmed_author_info[author_index]['forename'] + ' ' + pubmed_author_info[author_index]['surname']
    else:
        print('no PubMed data')

    # Augment CrossRef data with PubMed data. Typically the PubMed data is more likely to have the affiliations
    # Names are generally very similar, but vary with added or missing periods on initials and suffixes
    if pmid != '':
        for author_index in range(len(authors)):
            found = False
            crossref_name = authors[author_index]['givenName'] + ' ' + authors[author_index]['familyName']
            #print(crossref_name)
            for pubmed_author in pubmed_author_info:
                ratio = fuzz.ratio(pubmed_author['name'], crossref_name)
                #print(ratio, pubmed_author['name'])
                if ratio > 87: # had to drop down to this level because some people with missing "Jr" weren't matching
                    found = True
                    # NOTE(review): this string is only consumed by the commented-out print below; every
                    # later match assigns result_string again before it is printed.
                    result_string = 'fuzzy label match: ' + str(ratio) + pubmed_author['name'] + ' / ' + crossref_name
                    #print(result_string)
                    break
            if not found:
                print('Did not find a match in the PubMed data for', crossref_name)
            else:
                # pubmed_author still refers to the record that matched (the loop above ended with break)
                #print(pubmed_author)
                #print(authors[author_index])

                # If there is a PubMed affiliation and no affiliation in the CrossRef data, add the PubMed affiliation
                if pubmed_author['affiliation'] != '':
                    if len(authors[author_index]['affiliation']) == 0:
                        authors[author_index]['affiliation'].append(pubmed_author['affiliation'])

                # If there is an ORCID in PubMed and no ORCID in the CrossRef data, add the ORCID to CrossRef data
                # Not sure how often this happens since I think maybe usually if one has it, the other does, too.
                if pubmed_author['orcid'] != '':
                    if authors[author_index]['orcid'] == '':
                        authors[author_index]['orcid'] = pubmed_author['orcid']

                #print(authors[author_index])

            #print()
    #print(json.dumps(pubmed_author_info, indent=2))

    # Perform screening operations on authors to try to determine their Q IDs
    found_qid_values = []
    not_found_author_list = []
    author_count = 1 # used as the series ordinal of the agent
    for author in authors:
        print(author_count)
        found = False
        
        # First eliminate the case where all of the name pieces are empty
        # NOTE(review): break ends the whole loop, so any agents after an empty-name entry are not
        # processed at all. Confirm whether skipping only this entry (continue) was intended.
        if (author['givenName'] + ' ' + author['familyName']).strip() == '':
            break
            
        # Record stated_as (the name as given in the source, before the capitalization fix below)
        stated_as = (author['givenName'] + ' ' + author['familyName']).strip()
            
        # Fix case where names are stupidly in all caps
        name_pieces = author['givenName'].strip().split(' ')
        author['givenName'] = ' '.join(fix_all_caps(name_pieces))
        name_pieces = author['familyName'].strip().split(' ')
        author['familyName'] = ' '.join(fix_all_caps(name_pieces))
        
        # Screen for exact match to Wikidata labels
        for index, researcher in RESEARCHERS.iterrows():
            if researcher['label_en'] == author['givenName'] + ' ' + author['familyName']:
                found = True
                result_string = 'researcher exact label match: ' + researcher['qid'] + ' ' + researcher['label_en']
                name = researcher['label_en']
                qid = researcher['qid']
                break
        if not found:
            # screen for exact match to alternate names
            for index, altname in ALTNAMES.iterrows():
                if altname['altLabel'] == author['givenName'] + ' ' + author['familyName']:
                    found = True
                    result_string = 'researcher altname match: ' + altname['qid'] + ' ' + altname['altLabel']
                    name = altname['altLabel']
                    qid = altname['qid']
                    break
            if not found:
                # If the researcher has an ORCID, see if it's at Wikidata
                if author['orcid'] != '':
                    hit = searchWikidataForQIdByOrcid(author['orcid'])
                    # An empty dictionary is treated as "no item with this ORCID"
                    if hit != {}:
                        found = True
                        result_string = 'Wikidata ORCID search: ' + hit['qid'] + ' ' + hit['label'] + ' / ' + hit['description']
                        name = hit['label']
                        qid = hit['qid']

                if not found:
                    # screen for fuzzy match to Wikidata-derived labels
                    for index, researcher in RESEARCHERS.iterrows():
                        # Require the surname to match the label surname exactly
                        split_names = find_surname_givens(researcher['label_en']) # returns False if no family name
                        if split_names: # skip names that don't have 2 parts !!! also misses non-English labels!
                            if split_names['family'] == author['familyName']: # require exact match to family name
                                w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                if w_ratio > 90:
                                    found = True
                                    result_string = 'fuzzy label match: ' + str(w_ratio) + ' ' + researcher['qid'] + ' ' + researcher['label_en'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                    name = researcher['label_en']
                                    qid = researcher['qid']
                                    break
                    if not found:
                        # screen for fuzzy match to alternate names
                        for index, altname in ALTNAMES.iterrows():
                            split_names = find_surname_givens(altname['altLabel'])
                            if split_names: # skip names that don't have 2 parts
                                if split_names['family'] == author['familyName']: # require exact match to family name
                                    w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    if w_ratio > 90:
                                        found = True
                                        result_string = 'researcher altname fuzzy match: ' + str(w_ratio) + ' ' + altname['qid'] + ' ' + altname['altLabel'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                        name = altname['altLabel']
                                        qid = altname['qid']
                                        break
                        if not found:
                            # Last resort: search Wikidata by name and screen the resulting items
                            name = author['givenName'] + ' ' + author['familyName']
                            print('Searching Wikidata for', name)
                            print('researcher known affiliations: ', author['affiliation'])
                            print()
                            hits = search_name_at_wikidata(name, user_agent)
                            #print(hits)

                            qids = []
                            for hit in hits:
                                qids.append(hit['qid'])
                            return_list = screen_qids(qids, screens, settings['default_language'], user_agent) # screens is a global variable loaded at the start
                            #print(return_list)

                            for hit in return_list:
                                # Check each possible name match to the list of known co-authors/co-editors
                                # If there is a match, then use that Q ID and quit trying to match.
                                # NOTE(review): there is no break here, so if several hits are known
                                # co-authors the last one in return_list is the one that is kept.
                                if hit['qid'] in list(coauthors.index):
                                    found = True
                                    qid = hit['qid']
                                    result_string = 'Match with known coauthor'
                                    
                            if not found:
                                # Save discovered data to return if not matched
                                discovered_data = []
                                for hit in return_list:                                
                                    # hit_data is the same object as hit, so the keys added below are added to hit itself
                                    hit_data = hit
                                    split_names = find_surname_givens(hit['label'])

                                    # Require the surname to match the Wikidata label surname exactly
                                    # This prevents a high fraction of fuzzy matches where the last names are similar but not the same
                                    if split_names: # skip names that don't have 2 parts
                                        if split_names['family'] == author['familyName']: # require exact match to family name
                                            #print(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print(hit)
                                            w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('w_ratio:', w_ratio)
                                            #ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('ratio:', ratio)
                                            #partial_ratio = fuzz.partial_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('partial_ratio:', partial_ratio)
                                            #token_sort_ratio = fuzz.token_sort_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('token_sort_ratio:', token_sort_ratio)
                                            #token_set_ratio = fuzz.token_set_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                            #print('token_set_ratio:', token_set_ratio)

                                            # This screen requires a high degree of similarity between the overall ORCID names and Wikidata labels
                                            if w_ratio > 80:
                                                print('Wikidata search fuzzy match:', w_ratio, author['givenName'] + ' ' + author['familyName'], ' / ', 'https://www.wikidata.org/wiki/'+ hit['qid'], hit['label'])
                                                print('Wikidata description: ', hit['description'])

                                                # Here we need to check Wikidata employer and affiliation and fuzzy match against known affiliations
                                                occupations, employers, affiliations = search_wikidata_occ_emp_aff(hit['qid'], settings['default_language'], user_agent)
                                                print('occupations:', occupations)
                                                hit_data['occupations'] = occupations
                                                print('employers:', employers)
                                                hit_data['employers'] = employers
                                                print('affiliations', affiliations)
                                                hit_data['affiliations'] = affiliations
                                                print()

                                                # Perform a check of the employer to make sure we didn't miss somebody in the earlier
                                                # string matching
                                                # NOTE(review): a match here does not skip the PubMed check below, so the
                                                # article lookups still run (and may replace result_string) after an employer match.
                                                for employer in employers:
                                                    if 'Vanderbilt University' in employer: # catch university and med center
                                                        found = True
                                                        result_string = 'Match Vanderbilt employer in Wikidata: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                        qid = hit['qid']

                                                # If the author doesn't have any known affiliations, there is no point in checking PubMed
                                                if author['affiliation'] != []:
                                                    # Search Wikidata for articles written by this match
                                                    articles_in_wikidata = search_wikidata_article(hit['qid'])
                                                    #print(articles_in_wikidata)

                                                    # Step through articles with PubMed IDs found in Wikidata and see if the author affiliation or ORCID matches any of the articles
                                                    check = 0 # number of articles with a PubMed ID examined so far
                                                    for article_in_wikidata in articles_in_wikidata:
                                                        if article_in_wikidata['pmid'] != '':
                                                            check += 1
                                                            if check > max_pmids_to_check:
                                                                print('More articles, but stopping after checking', max_pmids_to_check)
                                                                break # break out of article-checking loop
                                                            print('Checking article, PMID:', article_in_wikidata['pmid'], article_in_wikidata['title'])
                                                            pubmed_match = identified_in_pubmed(article_in_wikidata['pmid'], author['givenName'] + ' ' + author['familyName'], author['affiliation'], author['orcid'])
                                                            if not pubmed_match:
                                                                #print('no match')
                                                                print()
                                                            else:
                                                                found = True
                                                                result_string = 'PubMed affilation match: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                                qid = hit['qid']
                                                                break # break out of article-checking loop

                                                if found:
                                                    break # break out of hit list loop
                                                print()
                                                # If none of the matching criteria are met, save the data for future use
                                                discovered_data.append(hit_data)

        # Record the outcome for this agent. discovered_data is assigned in the name-search branch above,
        # which is the only path that can reach this point with found still False.
        if not found:
            not_found_author_list.append({'name_string': author['givenName'] + ' ' + author['familyName'], 'series_ordinal': author_count, 'possible_matches': discovered_data})
            print('not found:', author['givenName'] + ' ' + author['familyName'])

        else:
            found_qid_values.append({'qid': qid, 'stated_as': stated_as, 'series_ordinal': author_count})
            print(result_string)
            # For information only: print the label of the department recorded for the matched person, if any
            for index, department in DEPARTMENTS.iterrows():
                if qid == department['qid']:
                    for lindex, department_label in DEPARTMENT_LABELS.iterrows():
                        if department_label['qid'] == department['affiliation']:
                            print(department_label['label_en'])
                            break
        print()
        author_count += 1

    print()
    return found_qid_values, not_found_author_list


# Accounting

We would like to know two things:
1. What publications are in Wikidata that we didn't put there? We might potentially want to monitor their metadata. (Categories B + C below)
2. What publications from the Zotero database dump are not yet in Wikidata? We cannot assume that it's all the ones we didn't put in, because they may have been uploaded by others. (Category D below)

![categories diagram](venn_diagram.png)

## Categories

| description | name | in Zotero | in Wikidata | uploaded by us |
| ----------- | -------- | --------- | ----------- | -------------- |
| A. uploaded to Wikidata by us | works | x | x | x |
| B. Wikidata in Zotero not by us | wd_zo_not_us | x | x |  |
| C. Wikidata not Zotero | wd_not_zo |  | x |  |
| D. In Zotero but not in Wikidata | not_wikidata | x |  |  |

## Calculation methods

**Uploaded by us (A)**

already present in works.csv; have qids and Zotero IDs

1030 works


**All publications in Wikidata not by us (B + C)** 

existing_works_in_wikidata.csv minus works.csv (A) -> wd_not_us

Use qid as unique identifier

988 works


**Zotero pubs in Wikidata, but not by us (B)**

wd_not_us intersection divinity_publications_2022-12-01.csv -> wd_zo_not_us

Must be done by fuzzy string matching since no common identifier. Assign a qid (already have Zotero ID).

567 works


**Wikidata pubs not in Zotero (C)**

existing_works_in_wikidata.csv minus { works.csv (A) union wd_zo_not_us (B) } -> wd_not_zo

Remove from existing works all that now have a Zotero ID.

423 works

**Zotero pubs not in Wikidata (D)**

divinity_publications_2022-12-01.csv minus { works.csv (A) union wd_zo_not_us (B) } -> not_wikidata.csv

Remove from div pubs all that have qids 

4183 works


In [None]:
# Load program settings, mappings, screens, and configurations
with open('settings.yaml', 'r') as file_object:
    settings = yaml.safe_load(file_object)

# All three tables are read with every column as a string and with empty cells kept as
# empty strings (no NaN conversion).

# Load the source file from the Zotero export, indexed by the Zotero item key
zotero = pd.read_csv(
    settings['data_file_path'] + settings['source_data_filename'],
    na_filter=False,
    dtype=str,
).set_index('Key')

# Load the works uploaded by us, indexed by Q ID
works = pd.read_csv(
    settings['data_file_path'] + 'works.csv',
    na_filter=False,
    dtype=str,
).set_index('qid')

# Load the works known to be in Wikidata (via SPARQL query), indexed by Q ID
existing_works_in_wikidata = pd.read_csv(
    settings['temporary_files_path'] + 'existing_works_in_wikidata.csv',
    na_filter=False,
    dtype=str,
).set_index('qid')


In [None]:
# Show the Zotero export for visual inspection.
# Uncommenting the next line restricts the frame to rows 170-179 (presumably for quick test runs).
#zotero = zotero.iloc[170:180]
zotero


In [None]:
# Q IDs of all works found in Wikidata; if the two counts printed below differ,
# the index contains repeated Q IDs.
ex_wo_qids = existing_works_in_wikidata.index.tolist()
print('existing works', len(ex_wo_qids))
print('non-redundant', len(set(ex_wo_qids)))
existing_works_in_wikidata

In [None]:
# Publications uploaded by us (A)
# Q IDs of the works we uploaded; if the two counts printed below differ,
# works.csv lists the same Q ID more than once.
wo_up_qids = list(works.index.values)
# Label corrected: this count is of the works uploaded by us, not of the existing works in Wikidata.
print('uploaded works', len(wo_up_qids))
print('non-redundant', len(set(wo_up_qids)))
works

Later:

10 works that we have on record as having uploaded aren't being found as works in WD. 

1. In some cases these were probably merged as duplicates of works already in Wikidata, so they should have been deleted from the works uploaded list. (Found one of these and deleted.)
2. In other cases, there might have been duplicates within the Zotero database itself (with one not yet uploaded, for example). They should have been deleted from the Zotero database list, though.
3. Another possibility is that we may have uploaded them but none of their agents were actually linked as Vanderbilt faculty.


In [None]:
# All publications in Wikidata that were not uploaded by us (B + C)
# existing_works_in_wikidata minus works -> wd_not_us
# The Q ID (index of both frames) is the unique identifier used for the comparison.
works_qids = works.index.tolist()
print('uploaded', len(works_qids))
print('non-redundant', len(set(works_qids)))
print('difference', len(ex_wo_qids) - len(works_qids))

# Keep only the Wikidata works whose Q ID is absent from our upload list,
# then move 'qid' from the index back to an ordinary column.
uploaded_by_us = existing_works_in_wikidata.index.isin(works_qids)
wd_not_us = existing_works_in_wikidata.loc[~uploaded_by_us].reset_index()
wd_not_us

In [None]:
# Zotero pubs in Wikidata, but not by us (B)
# wd_not_us intersection divinity_publications_2022-12-01.csv -> wd_zo_not_us
# Must be done by fuzzy string matching since there is no common identifier.
# Each matched work is assigned a qid (it already has a Zotero ID, the 'Key').

# Matched rows are collected in a list and converted to a DataFrame once, after the loop.
# (DataFrame.append() no longer exists in pandas 2.x and copied the whole frame on every call.)
matched_rows = []

count = 0
for index, work_data in zotero.iterrows():
    count += 1
    print(count, index, work_data['Title'])
    doi = mapping_functions.clean_doi(work_data['DOI'])
    pmid = mapping_functions.extract_pmid_from_extra(work_data['Extra'])
    wikidata_status, qid = work_in_wikidata_status(work_data['Title'], doi, pmid, wd_not_us, settings, verbose=True)

    # Weak matches (partial title, or a fuzzy match on a one-word title) are only
    # accepted if an agent name can also be matched.
    is_single_word_title = len(work_data['Title'].strip().split(' ')) == 1
    if wikidata_status == 'possible partial title' or (wikidata_status == 'fuzzy label match' and is_single_word_title):

        # Extract the agent family names from the agents in the Zotero output
        agent_family_names = []
        for agent_type in mapping_agents['sources']:
            agents = mapping_functions.extract_names_from_list(work_data[agent_type['in_col_label']], settings)
            for agent in agents:
                agent_family_names.append(agent['familyName'])

        # Query Wikidata for the English labels of all authors (P50), editors (P98),
        # and translators (P655) of the possibly matching work
        wdqs = Sparqler(useragent=user_agent)
        query_string = '''select distinct ?agentLabel where {
  BIND (wd:''' + qid + ''' AS ?work)
  {?work wdt:P50 ?author.
   ?author rdfs:label ?agentLabel.}
  union
  {?work wdt:P98 ?editor.
   ?editor rdfs:label ?agentLabel.}
  union
  {?work wdt:P655 ?translator.
   ?translator rdfs:label ?agentLabel.}
  filter(lang(?agentLabel) = 'en')
  }'''
        query_results = wdqs.query(query_string)
        sleep(settings['sparql_sleep'])

        # Check each name from Wikidata to see if any of the Zotero family names is within the Wikidata name
        found = False
        for result in query_results:
            name = result['agentLabel']['value']
            for agent_family_name in agent_family_names:
                # An empty family name is a substring of every label, so it must not count as a match.
                if agent_family_name and agent_family_name in name:
                    found = True
                    wikidata_status = 'good partial title with name match ' + name
                    break
            if found:
                break
        if not found:
            # No agent agrees, so the candidate is discarded.
            wikidata_status = 'bad partial title no agent name match'
            qid = ''

    print(wikidata_status, qid)
    print()

    # Record the outcome in a 'qid' column of the Zotero DataFrame ('' when there is no match)
    zotero.loc[index, 'qid'] = qid

    if qid != '':
        # Keep a copy of the matched row, restoring the index value as 'Key' so it doesn't
        # get lost and adding the Q ID.
        matched_row = work_data.to_dict()
        matched_row['Key'] = index
        matched_row['qid'] = qid
        matched_rows.append(matched_row)

# Build the result table in one step; it gets a default integer index, one row per match.
wd_zo_not_us = pd.DataFrame(matched_rows)

wd_zo_not_us.to_csv(settings['data_file_path'] + 'wd_zo_not_us.csv', index = False)
zotero.to_csv(settings['data_file_path'] + 'zotero.csv')

print('done')

## Important note:

After this step, the results need to be checked carefully. In some cases, the original Zotero output had duplicates and those need to be de-duplicated, both in the results and the original data.

In other cases, there were errors that resulted in multiple works being matched to the same Q ID. Those needed to be resolved as well.

In [None]:
# Re-read both tables from the data directory as all-string frames (empty cells stay '').
# NOTE(review): unlike the initial load of the Zotero export, 'Key' is left as an ordinary
# column here (no set_index), so zotero has a default integer index after this cell.
# Confirm that the cells that follow expect that.
data_file_path = settings['data_file_path']

zotero = pd.read_csv(data_file_path + 'zotero.csv', na_filter=False, dtype=str)
print(len(zotero))

wd_zo_not_us = pd.read_csv(data_file_path + 'wd_zo_not_us.csv', na_filter=False, dtype=str)
print(len(wd_zo_not_us))

In [None]:
# Wikidata pubs not in Zotero (C)
# existing_works_in_wikidata.csv minus { works.csv (A) union wd_zo_not_us (B) } -> wd_not_zo
# Remove from existing works all that now have a Zotero ID.

# Get the union of Zotero pubs in Wikidata
works_qids_list = list(works.index.values)
print('works_qids_list (A)', len(works_qids_list))
print('unique A', len(set(works_qids_list)))
wd_zo_not_us_qids_list = list(wd_zo_not_us['qid'])
print('wd_zo_not_us_qids_list (B)', len(wd_zo_not_us_qids_list))
print('unique B', len(set(wd_zo_not_us_qids_list)))
print('B+C', len(wd_not_us))

a_plus_b = works_qids_list + wd_zo_not_us_qids_list
print('A + B', len(a_plus_b))
zotero_pubs_in_wikidata_list = list(set(a_plus_b))

# Report Q IDs that occur more than once in A + B (within one list or across both).
# Occurrences are tallied in a single pass instead of calling list.count() for every element.
qid_counts = {}
for qid in a_plus_b:
    qid_counts[qid] = qid_counts.get(qid, 0) + 1
duplicates = [qid for qid in a_plus_b if qid_counts[qid] > 1]
unique_duplicates = list(set(duplicates))
print(unique_duplicates)

print('non-redudundant zotero_pubs_in_wikidata_list (A+B)', len(zotero_pubs_in_wikidata_list))
#print(zotero_pubs_in_wikidata_list)

# Remove those pubs from the total list of works in Wikidata
print('existing_works_in_wikidata', len(existing_works_in_wikidata))
wd_not_zo = existing_works_in_wikidata.loc[~existing_works_in_wikidata.index.isin(zotero_pubs_in_wikidata_list)].copy()
print('wd_not_zo (C)', len(wd_not_zo))
wd_not_zo

In [None]:
# Zotero pubs not in Wikidata (D)

# divinity_publications_2022-12-01.csv minus { works.csv (A) union wd_zo_not_us (B) } -> not_wikidata.csv
# Alternatively: remove from div pubs all that have qids
# NOTE(review): works['unique_identifier'] is presumably the Zotero key of each uploaded work; confirm.
works_key_list = list(works['unique_identifier'])
print(len(works_key_list))
wd_zo_not_us_key_list = list(wd_zo_not_us['Key'])
print(len(wd_zo_not_us_key_list))

# The filter compares against the Zotero 'Key'. When zotero has been re-read from zotero.csv
# without set_index, 'Key' is an ordinary column and the index is a default integer range, so
# testing the index directly would match nothing and every row would be reported as "not in
# Wikidata". Index by 'Key' first if that has not already been done.
zotero_by_key = zotero.set_index('Key') if 'Key' in zotero.columns else zotero

keys_in_wikidata = works_key_list + wd_zo_not_us_key_list
not_wikidata = zotero_by_key.loc[~zotero_by_key.index.isin(keys_in_wikidata)].copy()
print('not_wikidata (D)', len(not_wikidata))
not_wikidata.to_csv(settings['data_file_path'] + 'not_wikidata.csv')