In [None]:
# publoader.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.

# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Global variables
# ----------------

script_version = '0.0.1' # inserted into the user_agent string below
default_language = 'en' # language code used to filter labels in SPARQL queries
precision_cutoff = 0.95 # language detection confidence below which detect_language() prints a warning
phrase_length_cutoff = 2 # NOTE(review): not referenced in the visible part of this script -- confirm it is still needed
sparql_sleep = 0.1 # minimal delay between SPARQL queries
names_separator = ';' # character separating the individual names in a multi-name string
name_part_separator = ',' # Set to empty string if names aren't reversed

# The user_agent string identifies this application to Wikimedia APIs.
# If you modify this script, you need to change the user-agent string to something else!
user_agent = 'PubLoader/' + script_version + ' (mailto:steve.baskauf@vanderbilt.edu)'

# !!! Need to set up an error log!

# ----------------
# Module imports
# ----------------

import yaml
from datetime import datetime
from time import sleep
import json
import pandas as pd
import requests
from fuzzywuzzy import fuzz # fuzzy logic matching
from langdetect import detect_langs
import re # regex

# Table of pre-screened works, read entirely as strings (empty cells stay empty strings rather than NaN)
# and indexed by the Title column. include_reference_url() consults its Url column.
full_works = pd.read_csv('full_work_div_pub.csv', na_filter=False, dtype=str).set_index('Title')

# Maps two-letter language codes to Q IDs (presumably the Wikidata items for those languages -- verify).
# detect_language() looks up the detected language code here; a code that is not listed produces a
# warning and an empty-string result.
language_qid = {
    'en': 'Q1860',
    'de': 'Q188',
    'fr': 'Q150',
    'es': 'Q1321',
    'it': 'Q652',
    'nl': 'Q7411',
    'zh': 'Q7850',
    'no': 'Q9043',
    'ar': 'Q13955',
    'he': 'Q9288',
    'pt': 'Q5146'
}

"""
# List of known work types used by CrossRef
work_types = [
    {
    'type_string': 'journal-article',
    'qid': 'Q18918145', # academic journal article, alternatively Q13442814 scholarly article
    'description': 'journal article'
    },
    {
    'type_string': 'book',
    'qid': 'Q3331189', # "version, edition, or translation"
    'description': 'book'
    },
    {
    'type_string': 'book-chapter',
    'qid': 'Q21481766', # "academic chapter"
    'description': 'academic book chapter'
    },
    {
    'type_string': 'monograph',
    'qid': 'Q193495', # monograph
    'description': 'monograph'
    },
    {
    'type_string': 'reference-book',
    'qid': 'Q5292', # encyclopedia
    'description': 'encyclopedia'
    },
    {
    'type_string': 'reference-entry',
    'qid': 'Q13433827', # encyclopedia article, some are handbook articles
    'description': 'encyclopedia article'
    },
    {
    'type_string': 'dataset',
    'qid': 'Q13433827', # also used for encyclopedia articles
    'description': 'encyclopedia article'
    },
    {
    'type_string': 'other',
    'qid': 'Q55915575', # 
    'description': 'scholarly work'
    }
]
"""

# List of known work types used by Zotero
# Each entry pairs a source type string ('type_string') with the Q ID ('qid') returned by set_instance_of()
# and the text ('description') returned by set_description(). Type strings not listed here cause those
# functions to print a message and return an empty string.
work_types = [
    {
    'type_string': 'journalArticle',
    'qid': 'Q18918145', # academic journal article, alternatively Q13442814 scholarly article
    'description': 'journal article'
    },
    {
    'type_string': 'book',
    'qid': 'Q3331189', # "version, edition, or translation"
    'description': 'book'
    },
    {
    'type_string': 'bookSection',
    'qid': 'Q21481766', # "academic chapter"
    'description': 'academic book chapter'
    }
]

# Substrings used by include_reference_url() to screen URLs that are not among the pre-screened URLs
# in full_works. The substrings are all lowercase because they are tested against a lowercased URL.

# A URL containing any of these strings is rejected (they indicate that a login is required).
url_exclusion_strings = [
    'login',
    'proxy',
    #'search.proquest.com',
    'worldcat',
    'wp-content',
    'site.ebrary.com',
    'cro3.org/',
    'worldbookonline.com/pl/infofinder'
]

# A URL that passes the exclusion screen is accepted only if it contains at least one of these strings
# (they indicate metadata and possible access).
url_inclusion_strings = [
    'doi',
    'jstor',
    #'oxfordjournals.org/content',
    'article',
    'academia.edu',
    'content',
    'proquest.com/docview',
    'handle'
]

# ------------------------
# Utility functions
# ------------------------

def extract_local_name(iri):
    """Return the part of an IRI that follows its last "/" character.

    For a Wikidata entity IRI such as http://www.wikidata.org/entity/Q6386232 this is the Q ID (Q6386232).
    A string containing no "/" is returned unchanged.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

# Regular expression that an entire string must match to be treated as a valid http(s) URL:
# scheme, optional "www.", host characters, a dot, a 1-6 character final host label, then optional path/query characters.
url_pattern = r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"

def include_reference_url(url):
    """Screen a URL and return it if it is suitable to use for a reference, otherwise return an empty string.

    Currently it's assumed that the criteria are the same as for a URL where the full work is available.

    Parameters
    ----------
    url : str
        The candidate URL.

    Returns
    -------
    str
        The URL with its original capitalization if it passes the screen, otherwise an empty string.

    Notes
    -----
    All of the screening is done on a lowercased copy of the URL, so it is case insensitive. The value
    returned is the URL as it was passed in, because URL paths can be case sensitive and a lowercased
    URL may not resolve.
    """
    screening_url = url.lower() # lowercase copy used only for the comparisons below

    # Exclude invalid URLs
    if re.match(url_pattern, screening_url) is None:
        return ''

    # If the URL matches one of the pre-screened URLs, use it. The comparison ignores case on both sides
    # so that pre-screened URLs containing capital letters can be matched.
    if (full_works['Url'].str.lower() == screening_url).any():
        return url

    # Exclude any URLs containing strings that indicate a login is required
    for screening_string in url_exclusion_strings:
        if screening_string in screening_url:
            return ''

    # Must contain one of the strings that indicate metadata and possible access
    for screening_string in url_inclusion_strings:
        if screening_string in screening_url:
            return url

    return ''

def roman_integer_value(r):
    """Look up the integer value of a single uppercase Roman numeral symbol.

    Returns -1 for anything that is not one of I, V, X, L, C, D, or M.

    Note:
    -----
    Based on code from https://www.geeksforgeeks.org/python-program-for-converting-roman-numerals-to-decimal-lying-between-1-to-3999/"""
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    return symbol_values.get(r, -1)

def roman_to_decimal(numeral):
    """Convert a Roman numeral string to an integer.

    Parameters
    ----------
    numeral : str
        The Roman numeral. Case is ignored.

    Returns
    -------
    int
        The value of the numeral, or -1 if the string is empty or contains any character that is not
        a Roman numeral symbol.

    Note:
    -----
    Symbols are only checked individually. The string is not checked for being a well-formed numeral,
    so for example 'IIII' is converted to 4.
    Code from https://www.geeksforgeeks.org/python-program-for-converting-roman-numerals-to-decimal-lying-between-1-to-3999/"""
    symbols = numeral.upper()

    # An empty string is not a Roman numeral; report it as an error rather than as the value zero.
    if symbols == '':
        return -1

    res = 0
    i = 0

    while i < len(symbols):

        # Getting value of symbol at position i
        s1 = roman_integer_value(symbols[i])

        # Return a negative number if error.
        if s1 < 0:
            return -1

        if i + 1 < len(symbols):

            # Getting value of symbol at position i + 1
            s2 = roman_integer_value(symbols[i + 1])

            # Return a negative number if error.
            if s2 < 0:
                return -1

            # Comparing both values
            if s1 >= s2:

                # Value of current symbol is greater than
                # or equal to the next symbol, so it is simply added
                res = res + s1
                i = i + 1
            else:

                # Value of current symbol is less than the next symbol (subtractive
                # notation such as IV), so the pair is consumed together
                res = res + s2 - s1
                i = i + 2
        else:
            # Last symbol in the string
            res = res + s1
            i = i + 1

    return res

def title_if_no_lowercase(string):
    """Return the string in titlecase, unless it already contains a lowercase letter.

    Only the unaccented letters a-z count as lowercase. A string containing any of them is returned unchanged."""
    ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
    if any(character in ascii_lowercase for character in string):
        return string
    return string.title()

def fix_all_caps(name_pieces):
    """Fix the capitalization of name pieces that were written in ALL CAPS.

    Parameters
    ----------
    name_pieces : list of str
        Name strings from a name split by spaces.

    Returns
    -------
    list of str
        The pieces in the same order. Pieces without any lowercase letters are changed to titlecase,
        pieces that already contain lowercase letters are left alone, and certain name particles
        (von, de, van, la, der) are made lowercase.
    """
    apostrophe_list = ["van't", "'t", "O'", "D'", "d'", "N'"]
    lower_case_list = ['von', 'de', 'van', 'la', 'der']

    clean_pieces = []
    for piece in name_pieces:
        # Special handling for names starting with apostrophe-based prefixes. The prefix is only recognized
        # at the start of the piece and only that leading occurrence is detached, so that an apostrophe
        # in the middle of a name does not get moved to the front.
        apostrophe_prefix = ''
        for possible_apostrophe_prefix in apostrophe_list:
            if piece.startswith(possible_apostrophe_prefix):
                apostrophe_prefix = possible_apostrophe_prefix
                piece = piece[len(possible_apostrophe_prefix):]
                break

        # Special handling for name parts that are lowercase
        if piece.lower() in lower_case_list:
            piece = piece.lower()
        else:
            # Each part of a hyphenated name is handled separately, for any number of hyphens.
            # A piece with no hyphens is a single part.
            piece = '-'.join(title_if_no_lowercase(part) for part in piece.split('-'))

        # put any apostrophe prefix back on the front
        clean_pieces.append(apostrophe_prefix + piece)
    return clean_pieces
  
def extract_name_pieces(name):
    """Split a name (or part of a name) into its pieces and separate any generational suffix.

    Parameters
    ----------
    name : str
        The name string. Commas and periods are treated as spaces.

    Returns
    -------
    tuple (list of str, str)
        The name pieces with capitalization fixed by fix_all_caps(), and the suffix string
        (e.g. ', Jr.' or ' III'), which is an empty string if there was no suffix.
        An empty or punctuation-only name produces an empty list and an empty suffix.
    """
    # treat commas as if they were spaces
    name = name.replace(',', ' ')
    # get rid of periods, sometimes periods are close up with no spaces
    name = name.replace('.', ' ')

    pieces = [piece for piece in name.split(' ') if piece != '']

    # Nothing left to process (empty string or punctuation only)
    if len(pieces) == 0:
        return [], ''

    # Remove ", Jr.", "III", etc. from end of name. A piece is only treated as a suffix if something
    # precedes it; otherwise a lone initial such as "V." would be removed, leaving no name at all.
    suffix_forms = {'Jr': ', Jr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    if len(pieces) > 1 and pieces[-1] in suffix_forms:
        suffix = suffix_forms[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2] == 'the' and pieces[-1] == 'elder':
        pieces = pieces[:-2]
        suffix = ' the elder'
    else:
        suffix = ''

    # Fix situation where name is written in ALL CAPS
    pieces = fix_all_caps(pieces)
    return pieces, suffix
    
def extract_names_from_list(names_string):
    """Extract multiple authors from a character-separated list in a single string.

    Parameters
    ----------
    names_string : str
        Names separated by the names_separator global variable. Each name is expected to be reversed
        (surname first), with its parts separated by the name_part_separator global variable.

    Returns
    -------
    list of dict
        One dictionary per name that could be interpreted, with the keys 'orcid', 'givenName',
        'familyName', 'suffix', and 'affiliation'. Names that can't be interpreted are reported with a
        printed 'Name error:' message and left out. Empty entries (e.g. from a trailing separator) are
        skipped silently.
    """
    if names_string == '':
        return []

    names_list = names_string.split(names_separator)

    output_list = []
    # If names are last name first
    if name_part_separator:
        for name in names_list:
            # Skip empty entries, such as the one following a trailing separator
            if name.strip() == '':
                continue

            pieces = name.split(name_part_separator)
            if len(pieces) == 2: # no Jr.
                surname_pieces, suffix = extract_name_pieces(pieces[0].strip())
                given_pieces, dummy = extract_name_pieces(pieces[1].strip())
            elif len(pieces) == 3: # has Jr.
                # Note Jr. is handled inconsistently, sometimes placed after entire name, sometimes after surname
                if 'Jr' in pieces[2]:
                    surname_pieces, suffix = extract_name_pieces(pieces[0].strip() + ', ' + pieces[2].strip())
                    given_pieces, dummy = extract_name_pieces(pieces[1].strip())
                else:
                    surname_pieces, suffix = extract_name_pieces(pieces[0].strip() + ', ' + pieces[1].strip())
                    given_pieces, dummy = extract_name_pieces(pieces[2].strip())
            else:
                # Either the name wasn't reversed (one piece) or it has too many parts to interpret.
                # Move on to the next name; without this, the values left over from the previous name
                # would be appended again (or be undefined if this is the first name).
                print('Name error:', names_string)
                continue

            surname = ' '.join(surname_pieces)
            given = ' '.join(given_pieces)
            output_list.append({'orcid': '', 'givenName': given, 'familyName': surname, 'suffix': suffix, 'affiliation': []})
    else:
        pass # need to write code for case where they aren't reversed

    return output_list

# ------------------------
# SPARQL query class
# ------------------------

# This is a condensed version of the more full-featured script at 
# https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikidata/sparqler.py
# It includes only the method for the query form.

class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1
        
    Required modules:
    -------------
    requests, datetime, time
    """
    def __init__(self, method='post', endpoint='https://query.wikidata.org/sparql', useragent=None, sleep=0.1):
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None:
            if self.endpoint == 'https://query.wikidata.org/sparql':
                print('You must provide a value for the useragent argument when using the Wikidata Query Service.')
                print()
                raise KeyboardInterrupt # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
        self.sleep = sleep

        # Headers sent with every request; the Accept header is added when a query is made.
        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent
        
        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, form='select', verbose=False, **kwargs):
        """Send a SPARQL query to the endpoint.
        
        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself. 
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.
            
        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries
        containing the data (the result bindings).
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response body can't be parsed as JSON,
        None is returned. Callers must check for None before using the result.
        For other forms and mediatypes, the raw output is returned.
        The raw response text of the most recent query is also kept in the response attribute.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header. 
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation        
        """
        query_form = form
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        else:
            if query_form == 'construct' or query_form == 'describe':
                media_type = 'text/turtle'
            else:
                media_type = 'application/sparql-results+json' # default for SELECT and ASK query forms
        self.requestheader['Accept'] = media_type
            
        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query' : query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']
        
        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        start_time = datetime.now()
        if self.http_method == 'post':
            response = requests.post(self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = requests.get(self.endpoint, params=payload, headers=self.requestheader)
        elapsed_time = (datetime.now() - start_time).total_seconds()
        self.response = response.text
        sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        if query_form == 'construct' or query_form == 'describe':
            return response.text
        else:
            if media_type != 'application/sparql-results+json':
                return response.text
            else:
                try:
                    data = response.json()
                except ValueError:
                    # The body wasn't valid JSON (e.g. an error page from the endpoint). The JSON decoding
                    # errors raised by requests are subclasses of ValueError. Catching only that, rather than
                    # everything, lets unrelated problems (including a keyboard interrupt) surface.
                    return None # Returns no value if an error. 

                if query_form == 'select':
                    # Extract the values from the response JSON
                    results = data['results']['bindings']
                else:
                    results = data['boolean'] # True or False result from ASK query 
                return results           

# ------------------------
# mapping functions
# ------------------------

def identity(value):
    """Pass the value through unchanged, apart from trimming whitespace from both ends."""
    trimmed_value = value.strip()
    return trimmed_value

def set_instance_of(string):
    """Look up the Q ID for a type string in the work_types list of the data source.

    Returns an empty string for an empty type string. An unrecognized type string is reported with a
    printed message and also produces an empty string."""
    if string == '':
        return ''

    # First entry whose type string matches, if there is one
    matching_type = next((entry for entry in work_types if entry['type_string'] == string), None)
    if matching_type is not None:
        return matching_type['qid']

    print('Did not find datatype for type:', string)
    return ''

def set_description(string):
    """Look up the description for a type string in the work_types list of the data source.

    Returns an empty string for an empty type string. An unrecognized type string is reported with a
    printed message and also produces an empty string."""
    if string == '':
        return ''

    # First entry whose type string matches, if there is one
    matching_type = next((entry for entry in work_types if entry['type_string'] == string), None)
    if matching_type is not None:
        return matching_type['description']

    print('Did not find datatype for type:', string)
    return ''

def detect_language(string):
    """Detect the language of the label and return the Q ID for that language.

    Parameters
    ----------
    string : str
        The text whose language is to be detected.

    Returns
    -------
    str
        The Q ID found in the language_qid dictionary for the most probable language, or an empty string
        if the input is empty or the language isn't in the dictionary.

    Notes
    -----
    Warnings are printed when the confidence is below precision_cutoff and when the language is unknown.
    Region-specific codes reported by the detector (e.g. zh-cn) are reduced to the bare language code (zh)
    before the lookup.
    """
    if string == '':
        return ''
    try:
        # The results are ordered with the most probable language first.
        best_guess = detect_langs(string)[0]
        # Use the attributes of the result rather than slicing its string form, which only works for
        # two-letter codes.
        lang = best_guess.lang.split('-')[0]
        confidence = float(best_guess.prob)
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    if confidence < precision_cutoff:
        print('Warning: language confidence for', lang, 'below', precision_cutoff, ':', confidence)
    if lang in language_qid:
        return language_qid[lang]
    else:
        print('Warning: detected language', lang, 'not in list of known languages.')
        return ''

def title_en(string):
    """Return the string if its detected language is English, otherwise return an empty string.

    Parameters
    ----------
    string : str
        The text (e.g. a title) to be tested.

    Returns
    -------
    str
        The unchanged input if the most probable detected language is 'en'. An empty string if the input
        is empty, another language is more probable, or the language can't be detected.
    """
    if string == '':
        return ''
    try:
        # The results are ordered with the most probable language first.
        lang = detect_langs(string)[0].lang
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
    if lang == 'en':
        return string
    else:
        return ''

    
def calculate_pages(range):
    """Calculate the number of pages from the page range.

    Parameters
    ----------
    range : str
        The page range, e.g. '12-15' or 'ix-xii'. The two page numbers may be separated by a hyphen,
        an en dash, or an em dash.

    Returns
    -------
    str
        The number of pages as a string, or an empty string if the value isn't a range of exactly two
        page numbers, a page number is missing or can't be interpreted, or the range runs backwards.

    Note
    ----
    Supports properly formatted Roman numerals and doesn't care about whitespace.
    The parameter name shadows the built-in range(), but is kept so that existing calls still work."""
    if range == '':
        return ''

    # Treat en and em dashes the same as hyphens
    numbers = range.replace('\u2013', '-').replace('\u2014', '-').split('-')

    # A single number has one part; a range that isn't well formed (multiple hyphens) has more than two.
    if len(numbers) != 2:
        return ''

    # Step through the two numbers to try to convert them from Roman numerals if not integers.
    page_numbers = []
    for number in numbers:
        number = number.strip()

        # A missing number (e.g. '-12' or '12-') means there's no range to calculate
        if number == '':
            return ''

        # isdecimal() is True only for digit strings that int() is able to convert
        if number.isdecimal():
            page_numbers.append(int(number))
        else:
            converted_number = roman_to_decimal(number)

            # Will return -1 error if it contains characters not valid for Roman numerals
            if converted_number < 0:
                return ''
            page_numbers.append(converted_number)

    number_pages = page_numbers[1] - page_numbers[0] + 1 # Need to add one since first page in range counts
    if number_pages < 1:
        return ''
    return str(number_pages)

def clean_doi(value):
    """Normalize a DOI by converting it to uppercase and trimming whitespace from both ends."""
    return value.upper().strip()

def disambiguate_published_in(value):
    """Use the value in the ISSN column to try to find the containing work.

    Parameters
    ----------
    value : str
        An ISSN, or a Q ID that was inserted in the ISSN column during pre-processing.

    Returns
    -------
    str
        The Q ID of the containing work. An empty string is returned if the value is empty, no item
        with the ISSN is found, or the query fails.

    Note:
    -----
    For journal articles, this performs a legitimate WQS search for the journal title using the ISSN.
    For book chapters, the ISSN column may contain the Q ID of the containing book, inserted there during
    a pre-processing step (a hack, but typically books would not have an ISSN and this column would be empty)."""
    if value == '':
        return value

    # The value is a Q ID and was determined during a pre-processing step (i.e. for book chapters)
    if value[0] == 'Q':
        return value

    # Look up the ISSN from CrossRef in Wikidata
    # Escape backslashes and double quotes so that an unexpected value can't break out of the string literal
    issn_literal = value.replace('\\', '\\\\').replace('"', '\\"')

    # Build query string
    query_string = '''select distinct ?container ?containerLabel where {
      ?container wdt:P236 "''' + issn_literal + '''".
      optional {
      ?container rdfs:label ?containerLabel.
      filter(lang(?containerLabel)="''' + default_language + '''")
      }
    }'''
    #print(query_string)

    wdqs = Sparqler(useragent=user_agent)
    query_results = wdqs.query(query_string)
    sleep(sparql_sleep)

    # The query method returns None when the response can't be parsed, and an empty list when nothing matches.
    if not query_results:
        return ''

    # !!!!!!!!!!!!!!!!!! Enable this code when the error log is set up
    #if len(query_results) > 1:
    #    print('Warning! More than one container in Wikidata matched the ISSN ', file=log_object)
    #    print(query_results, '\n', file=log_object)

    # Extract Q ID from SPARQL query results. If there is more than one result, the last one is used for the Q ID.
    container_qid = extract_local_name(query_results[-1]['container']['value'])

    # Checking the label of the container against the journal title from the source isn't done here, since the
    # container name isn't passed into the function. (Earlier empirical testing found that fuzz.WRatio worked
    # best for that comparison, with a warning logged when the score was below 99.)
    return container_qid

def isbn10(string):
    """Return the ISBN unchanged if it has 10 characters (not counting hyphens), otherwise an empty string."""
    character_count = len(string.replace('-', ''))
    return string if character_count == 10 else ''

def isbn13(string):
    """Return the ISBN unchanged if it has 13 characters (not counting hyphens), otherwise an empty string."""
    character_count = len(string.replace('-', ''))
    return string if character_count == 13 else ''

def disambiguate_publisher(name_string):
    """Find the publisher whose label is the best fuzzy match to the stated publisher name.

    The candidates are the rows of the global publishers DataFrame, a list derived from the SPARQL query
    at https://w.wiki/4pbi (the row index is used as the Q ID). The index of the best-scoring row is
    returned even if the match is poor; a warning is printed when the score is below 98.
    An empty publisher name produces an empty string."""
    # Without a publisher string there's nothing to match
    if name_string == '':
        return ''

    top_score = 0
    top_qid = ''
    top_label = ''
    for candidate_qid, candidate in publishers.iterrows():
        candidate_score = fuzz.WRatio(name_string, candidate['label'])
        # Only a strictly better score replaces the current best, so the first of any tied rows is kept
        if candidate_score <= top_score:
            continue
        top_qid = candidate_qid
        top_label = candidate['label']
        top_score = candidate_score

    # The warning goes to standard output for now (no error log has been set up)
    if top_score < 98:
        print('w_ratio:', top_score, 'Warning: poor match of: "' + top_label + '"', top_qid, 'to stated publisher: "' + name_string + '"\n')
    return top_qid

def disambiguate_place_of_publication(value):
    """Look up place of publication Q ID from a list derived from query https://w.wiki/63Ap

    If there is a single match, the Q ID is returned.
    If there are no matches, the string is returned (with any state names abbreviated as described below).
    If there are multiple matches, a list of dicts with the possible values ('qid' and 'label') is returned.
    An empty string produces an empty string."""
    if value == '':
        return ''

    # Places that are recognized directly without consulting the list
    for place_name, place_qid in (('New York', 'Q60'), ('New Brunswick', 'Q138338')):
        if place_name in value:
            return place_qid

    # Abbreviate state names. The order matters: "California" must be handled before "Calif".
    for state_name, abbreviation in (('California', 'CA'), ('Calif', 'CA'), ('Massachusetts', 'MA')):
        value = value.replace(state_name, abbreviation)

    # "Cambridge, M..." (e.g. "Cambridge, MA" or "Cambridge, Mass.") gets a different Q ID from any other Cambridge
    if 'Cambridge' in value:
        return 'Q49111' if 'Cambridge, M' in value else 'Q350'

    # Collect every location whose label occurs in the string.
    # The publisher_locations DataFrame is a global variable; its row index is used as the Q ID.
    location_list = [
        {'qid': qid, 'label': location['label']}
        for qid, location in publisher_locations.iterrows()
        if location['label'] in value
    ]
    if len(location_list) == 1:
        return location_list[0]['qid']
    if len(location_list) == 0:
        return value
    return location_list

def today():
    """Generate the current UTC xsd:date

    Returns
    -------
    str
        The current date in UTC, form: 2019-12-05
    """
    # Imported here because only the datetime class is imported at the top of the script.
    from datetime import timezone

    # datetime.utcnow() is deprecated (Python 3.12+); a timezone-aware UTC time gives the same date.
    return datetime.now(timezone.utc).date().isoformat()

def set_reference(input_url):
    """Return the URL to use as the reference URL value, or an empty string if the URL in the field
    is missing or doesn't pass the screen for suitable URLs."""
    # The screening function already returns an empty string for an unsuitable URL
    return include_reference_url(input_url)

def set_stated_in(input_url):
    """Return the fixed Q ID to use as the stated_in value when there is no suitable URL in the field.
    An empty string is returned if the URL passes the screen for suitable URLs."""
    screened_url = include_reference_url(input_url)
    # Q114403967 is the Vanderbilt Divinity publications database
    return '' if screened_url != '' else 'Q114403967'

# ---------------------------
# Major processes functions
# ---------------------------

def build_function(function_name_string, passed_value_string):
    """Call the function with the given name, passing it the given string, and return the result.

    Parameters
    ----------
    function_name_string : str
        The name of the function to be called, e.g. the name of one of the mapping functions.
    passed_value_string : str
        The value to pass as the single argument of the function. May be an empty string.

    Returns
    -------
    Whatever the called function returns.

    Notes
    -----
    Only the function name is evaluated. The value is passed to the function as an object rather than
    being written into the evaluated expression, so values containing quotes or backslashes reach the
    function exactly as they are.
    """
    # NOTE(review): eval() executes whatever expression the name string contains, so the names must come
    # from a trusted source -- confirm that they are never taken from the data being processed.
    mapping_function = eval(function_name_string)
    return mapping_function(passed_value_string)

def _evaluate_mapped_value(mapping_entry, work_data):
    """Run the processing function named in a qualifier or reference mapping entry and return its result.

    If the entry has a "source" key, the value in that source column is passed to the function;
    otherwise the function is called with no argument.
    """
    if 'source' in mapping_entry:
        # Go through build_function() so the source string is handed to the function exactly as it is
        # for the main property value, rather than being spliced into a ''' literal here (which raised
        # a SyntaxError for values such as ones enclosed in single quotes).
        return build_function(mapping_entry['value'], work_data[mapping_entry['source']])
    # Some functions like today() don't need input from the source table.
    # NOTE: eval() runs whatever name the mapping supplies; only use mapping files you trust.
    return eval(mapping_entry['value'] + '()')

def extract_metadata(mapping, work_data):
    """Steps through fields described in the config file, maps them to columns in the source data, and
    uses processing functions to transform the input data to forms required in the output table.
    
    Parameters
    ----------
    mapping : complex structure
        Maps column headers ("variable") in the destination table to column headers ("source") in the source table.
        The "value" key indicates the function used to determine the value to be used in the destination table.
    work_data : dict
        A row of data from the source data table with column headers as the keys. (The main routine passes
        a pandas Series row, which is accessed the same way.)

    Returns
    -------
    dict
        One row for the output table. Keys are output column headers: "qid", then for each property its
        "_uuid" column, its value column(s), its qualifier columns, and its reference columns.

    Notes
    -----
    Reads the global variable config; the properties are taken from config['outfiles'][0]['prop_list'].
    """
    out_dict = {'qid': ''}

    for out_property in config['outfiles'][0]['prop_list']:
        
        # Find the mapping variable that matches the config property.
        # NOTE(review): if no mapping entry matches, prop is silently left as the last entry in
        # mapping['properties'] -- confirm that every config property has a mapping.
        for prop in mapping['properties']:
            if prop['variable'] == out_property['variable']:
                break
    
        out_field = out_property['variable']
        out_dict[out_field + '_uuid'] = ''
        
        # The mapping function may not require an argument. In that case, there's no source column.
        if 'source' in prop:
            # If the source data CSV doesn't have any column named according to mappings, the output for that
            # variable is an empty string.
            if prop['source'] in work_data:
                output_value = build_function(prop['value'], work_data[prop['source']])
            else:
                output_value = ''
        # Case where there's no argument passed to mapping function
        else:
            output_value = eval(prop['value'] + '()')

        # An empty string means there is no statement for this property, so its unit, qualifiers,
        # and references are left empty as well.
        no_value = output_value == ''

        # Populate the values-related columns
        if out_property['value_type'] == 'date':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            out_dict[out_field + '_prec'] = ''

        elif out_property['value_type'] == 'quantity':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_unit'] = ''
            else:
                out_dict[out_field + '_unit'] = prop['quantity_unit']

        # This is not actually implemented and will generate an error if used
        elif out_property['value_type'] == 'globecoordinate':
            out_dict[out_field + '_nodeId'] = ''
            out_dict[out_field + '_val'] = output_value
            if no_value:
                out_dict[out_field + '_long'] = ''
                out_dict[out_field + '_prec'] = ''
            else:
                out_dict[out_field + '_long'] = work_data[out_field + '_long']
                out_dict[out_field + '_prec'] = work_data[out_field + '_prec']

        else:
            out_dict[out_field] = output_value

        # Populate the qualifier columns
        for qualifier in out_property['qual']:
            if no_value:
                qual_output_value = ''
            else:
                # Find the mapping variable that matches the config property
                # NOTE(review): as above, an unmatched qualifier falls through to the last entry in prop['qual'].
                for qual in prop['qual']:
                    if qual['variable'] == qualifier['variable']:
                        break
                # If this evaluates as an empty string, the result is the same as having no qualifier value.
                qual_output_value = _evaluate_mapped_value(qual, work_data)

            qual_field = out_field + '_' + qualifier['variable']
            # To my knowledge, dates are the only complex types used as qualifiers (no quantities or globecoordinates).
            if qualifier['value_type'] == 'date':
                out_dict[qual_field + '_nodeId'] = ''
                out_dict[qual_field + '_val'] = qual_output_value
                out_dict[qual_field + '_prec'] = ''
            else:
                out_dict[qual_field] = qual_output_value
                
        # Populate the reference columns
        # There's only a hash ID column if there's at least one reference.
        if len(out_property['ref']) > 0:
            out_dict[out_field + '_ref1_hash'] = ''
            
        for reference in out_property['ref']:
            if no_value:
                ref_output_value = ''
            else:
                # Find the mapping variable that matches the config property
                # NOTE(review): as above, an unmatched reference falls through to the last entry in prop['ref'].
                for ref in prop['ref']:
                    if ref['variable'] == reference['variable']:
                        break
                # If this evaluates as an empty string, the result is the same as having no reference value.
                ref_output_value = _evaluate_mapped_value(ref, work_data)

            ref_field = out_field + '_ref1_' + reference['variable']
            # Dates are the only complex type handled for references (no quantities or globecoordinates).
            if reference['value_type'] == 'date':
                out_dict[ref_field + '_nodeId'] = ''
                out_dict[ref_field + '_val'] = ref_output_value
                out_dict[ref_field + '_prec'] = ''
            else:
                out_dict[ref_field] = ref_output_value
                    
    #print(out_dict)
    return out_dict



# Preprocessing

Idiosyncratic steps that need to be done between the Zotero output and running the "standardized" script.


In [None]:
# This step involves re-setting the Url column to use the screened URLs if the Zotero output title matches
# the title in the screened full work CSV.

source_data = 'output_examples_div_pubs.csv'
#source_data = 'output-example_baldwinbookschaptersarticles.csv'
works = pd.read_csv(source_data, na_filter=False, dtype = str)
#works = works.iloc[180:225] # test for full text URL substitutions
#works = works.iloc[159:164]
#works = works.iloc[2033:2052] # test of ref screening
#works = works.iloc[1970:1988] # good rows to test for languages
# Only the first five rows are processed. The copy makes works an independent DataFrame so that
# the .loc assignment below does not write into a slice of the frame that was read in.
works = works.iloc[:5].copy()

for label, work_series in works.iterrows():
    # Find the row(s) in the full_works DataFrame that matches the series. There should be only one.
    # The result is a series of URL values for those rows.
    matching_urls = full_works.loc[full_works.index==work_series['Title'], 'Url']

    # No matching title: keep the Url value from the Zotero output.
    if len(matching_urls) == 0:
        continue

    # Since there should be only one, get the first value. Use .iloc for this: the series is indexed by
    # title strings, so [0] depends on a deprecated positional fallback, and the error raised once that
    # fallback is removed would have been silently swallowed by the former bare except.
    new_url = matching_urls.iloc[0]
    # Set a new value for the Url column in the works DataFrame using the looked-up URL.
    works.loc[label, 'Url'] = new_url
    
works.to_csv('preprocessed.csv', index = False)
print('done')

# Main routine

NOTE: Before continuing past this step, you need to manually correct any publication locations that could not be determined.

In [None]:
# Lookup tables, each indexed by its qid column
publishers = pd.read_csv('publishers.csv', na_filter=False, dtype = str).set_index('qid')
publisher_locations = pd.read_csv('publisher_locations.csv', na_filter=False, dtype = str).set_index('qid')

# Source table of works, with every cell read as a string and empty cells kept as empty strings
source_data = 'preprocessed.csv'
works = pd.read_csv(source_data, na_filter=False, dtype = str)

# Output table configuration
with open('config.yaml', 'r') as file_object:
    config = yaml.safe_load(file_object)

# Mapping used for the main work metadata
with open('mapping.yaml', 'r') as file_object:
    mapping = yaml.safe_load(file_object)

# Mapping used for the agent (author, editor, etc.) columns
with open('mapping_agents.yaml', 'r') as file_object:
    mapping_agents = yaml.safe_load(file_object)


In [None]:
works_list = []
agents_list = []
for index, work_data in works.iterrows():
    print(work_data['Title'])
    locally_unique_id = work_data['Key']

    # Main metadata: one output row per work, produced from the source columns via the mappings
    works_list.append(extract_metadata(mapping, work_data))

    # Agent data: for each agent type (author, editor, etc.), run its mapping function on the
    # source column and store the result as a JSON string, keyed by the agent type's variable name.
    agents_dict = {'id': locally_unique_id}
    for agent_type in mapping_agents['sources']:
        agent_structured_data = build_function(agent_type['value'], work_data[agent_type['source']])
        agents_dict[agent_type['variable']] = json.dumps(agent_structured_data)
    agents_list.append(agents_dict)

# Turn the accumulated rows into tables
out_frame = pd.DataFrame(works_list)
agents_frame = pd.DataFrame(agents_list)

# Save both tables as CSV files
out_frame.to_csv('test_works_to_write.csv', index = False)
agents_frame.to_csv('stored_retrieved_authors.csv', index = False)
print('done')


In [None]:
# Scratch test of extract_names_from_list(). The active input is the empty string;
# uncomment one of the example strings (after the active line) to try a different name form.
authors_string = ''
#authors_string = 'Floyd-Thomas, Stacey M.; De La Torre, Miguel A.'
#authors_string = 'Armour, Ellen; Garland-Thomson, Rosemarie'
#authors_string = 'Darity, William A., Jr.'
#authors_string = 'Resner, Jr., André'
#authors_string = 'Williams IV, Joe'
extract_names_from_list(authors_string)

In [None]:

# Print the result of disambiguate_place_of_publication() for every work with a non-empty Place value
for index, work_data in works.iterrows():
    place = work_data['Place']
    if place == '':
        continue
    print(disambiguate_place_of_publication(place))
