# Script to create Wikidata items for ACT artwork

Initially, this will be for works that are already in Commons, but for which the links to Commons are from non-artwork Wikidata items. Eventually, we can modify this for any work that's in Commons and doesn't have a Wikidata item.

## Properties whose values need to be cleaned/generated

Taken from [here](https://github.com/HeardLibrary/vandycite/blob/master/act/processed_lists/candidate_properties_to_write.csv).

P571 (inception): get from ACT "DateCreation" and Commons "date" fields

## Configuration section

Run once at the start


In [None]:
# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# Import modules
import json
import csv
import math
import datetime
from time import sleep
import requests
import re # regex
# Pandas for data frame management
import pandas as pd
# Fuzzy string matching
from fuzzywuzzy import fuzz # fuzzy logic matching

# Settings shared by every request sent to the Wikidata Query Service
accept_media_type = 'application/json'
endpoint = 'https://query.wikidata.org/sparql'
user_agent_header = 'act_disambiguation/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
sparql_sleep = 0.1 # passed to sleep() after each query, i.e. a pause in seconds
default_language = 'en' # language tag used to filter labels/descriptions when screening Q IDs
output_directory = 'create_missing_artwork_items/'

# Load data from the two sources (ACT database dump and Commons Mediawiki table scrape)
# na_filter=False together with dtype=str makes every cell a string (empty cells are '' rather than NaN)
act_data = pd.read_csv('act_data_fix.csv', na_filter=False, dtype = str)
commons_data = pd.read_csv('commons_data_fix.csv', na_filter=False, dtype = str)
ids = pd.read_csv('clean_ids.csv', na_filter=False, dtype = str)
country_mappings = pd.read_csv('country_mappings.csv', na_filter=False, dtype = str)
collections_mappings = pd.read_csv('collections.csv', na_filter=False, dtype = str)

# For testing purposes, just use the first few rows
# (uncomment the two lines below to truncate both tables to test_rows rows)
test_rows = 10
#act_data = act_data.head(test_rows).copy()
#commons_data = commons_data.head(test_rows).copy()

# --------------------
# Low-level functions
# --------------------

def generate_utc_date():
    """Return the current UTC date as an ISO 8601 string (form: 2019-12-05)."""
    # datetime.utcnow() is deprecated as of Python 3.12; ask for a timezone-aware
    # UTC timestamp instead and keep only its date part.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Compute the UTC date once at load time; process_dates() uses it as the reference
# "retrieved" value and as the cutoff for the future-date sanity check.
today = generate_utc_date()

# Load JSON file data from local drive into a Python data structure
def load_json_into_data_struct(path):
    """Read the UTF-8 text file at `path` and return its parsed JSON content."""
    with open(path, 'rt', encoding='utf-8') as json_file:
        # To inspect the data while debugging, print json.dumps(<result>, indent=2)
        return json.load(json_file)

# screens.json is a configuration file that defines the kinds of screens to be performed on potential Q ID matches from Wikidata.
# It is read once here; the path is relative to the current working directory.
screens = load_json_into_data_struct('screens.json')

# Read from a CSV file on disk into a list of dictionaries (representing a table)
def read_dicts_from_csv(filename):
    """Return every data row of the UTF-8 CSV file `filename` as a dictionary keyed by the header row."""
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows before the file is closed
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write `table` (an iterable of dictionaries) to `filename` as UTF-8 CSV.

    `fieldnames` fixes the column order and is written as the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def pad_zeros_left(date_string):
    """Left-pad `date_string` with zeros to four characters; longer strings are returned unchanged."""
    return date_string.rjust(4, '0')

def generate_date_string(date, bce):
    """Build a January 1 dateTime string (e.g. 1500-01-01T00:00:00Z) for the year `date`.

    The year is zero-padded to four digits and prefixed with '-' when `bce` is true.
    """
    sign = '-' if bce else ''
    return sign + pad_zeros_left(str(date)) + '-01-01T00:00:00Z'

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return whatever follows the last '/' in `iri` (the whole string if there is no '/')."""
    # pattern is http://www.wikidata.org/entity/Q6386232
    return iri.rsplit('/', 1)[-1]

# NOTE: there are still some issues that have not been worked out with quotation marks in query strings.
# Still working on this; see also the send_sparql_query() below.
def generate_sparql_header_dictionary(accept_media_type,user_agent_header):
    """Assemble the HTTP request headers sent with each SPARQL POST."""
    headers = {}
    headers['Accept'] = accept_media_type
    # Queries are posted as form data; 'application/sparql-query' was the earlier alternative
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['User-Agent'] = user_agent_header
    return headers

# Build the request header once; the SPARQL query functions defined below read it as a global
sparql_request_header = generate_sparql_header_dictionary(accept_media_type, user_agent_header)

# ----------------------------
# Intermediate-level functions
# ----------------------------

# Parse the ACT date string into structured components
def process_act_date(act_date):
    """Parse an ACT date string into structured components.

    Forms handled (after an optional "ca." marker is stripped): a plain integer
    year ("1500"), a year range ("1500-1510"), a single century ("12th century")
    and a century range ("12th-13th centuries"). Centuries are represented by
    their mid-century year (century * 100 - 50).

    Returns the tuple
    (found, date, act_range, start_date, end_date, act_century, act_circa).
    `found` is False only for an empty input string; a non-empty string that
    cannot be parsed prints a message and comes back with the numeric fields
    left at 0. For a range, `date` is the floor of the midpoint.
    """
    act_circa = False
    act_range = False
    act_century = False
    date = 0
    start_date = 0
    end_date = 0

    # If there is no date from ACT, kill the function and return False
    if act_date == '':
        return False, date, act_range, start_date, end_date, act_century, act_circa

    # Determine circa status of ACT date
    if 'ca.' in act_date:
        act_circa = True
        # Keep what follows the "ca." and clean whitespace
        act_date = act_date.split('ca.')[1].strip()

    # Test whether the ACT date is a number
    try:
        date = int(act_date)
        non_numeric = False
    except ValueError:
        non_numeric = True

    if non_numeric:
        if 'century' in act_date: # single century date
            act_century = True
            # Remove the "century" and "th", "rd", "st", etc. from the end
            act_date = act_date[:-10]
            try:
                date = int(act_date) * 100 - 50 # set the date at mid-century
            except ValueError:
                print('numeric conversion error on', act_date)
        elif 'centuries' in act_date: # range of centuries
            act_century = True
            # Remove the "centuries" from the end; the ordinal endings are cut off below
            act_date = act_date[:-10].strip()
            try:
                pieces = act_date.split('-')
                range_start = int(pieces[0][:-2]) * 100 - 50 # set the date at mid-century
                range_end = int(pieces[1][:-2]) * 100 - 50 # set the date at mid-century
            except (ValueError, IndexError):
                print('error in processing century range')
            else:
                # Commit both ends together so a half-parsed range never leaks out
                start_date, end_date, act_range = range_start, range_end, True
        else:
            # Process date ranges (non-numeric because they include "-")
            try:
                pieces = act_date.split('-')
                range_start = int(pieces[0])
                range_end = int(pieces[1])
            except (ValueError, IndexError):
                print('error in processing date range')
            else:
                # Commit both ends together; previously "1500-" left start_date set
                # with end_date at 0, which produced a bogus midpoint of 750.
                start_date, end_date, act_range = range_start, range_end, True

    # if there is a range of dates, set the single date as the midpoint
    if start_date != 0 or end_date != 0:
        date = (start_date + end_date) // 2

    return True, date, act_range, start_date, end_date, act_century, act_circa
    
# Disassemble Wikibase-style dateTime strings into year, precision, and BCE components
def extract_from_iso_date_string(string):
    """Split a string such as '+1500-01-01T00:00:00Z/9' into (year, precision, bce).

    `year` is returned as a non-negative int, `precision` is the text after the
    slash, and `bce` is True when the dateTime starts with a minus sign.
    Raises IndexError when the slash (or the dateTime part) is missing.
    """
    pieces = string.split('/')
    # precision comes after the slash in the Wikibase format
    precision = pieces[1]
    date_time = pieces[0]
    # check for negative sign for BCE dates
    sign = date_time[0]
    bce = sign == '-'
    # Drop the sign only when one is present, so an unsigned year keeps its first digit
    if sign in ('+', '-'):
        date_time = date_time[1:]
    year = date_time.split('-')[0]
    return int(year), precision, bce

# Parse any structured date data that was scraped from the Commons Mediawiki table
def process_commons_date(commons_date_string):
    """Pull structured inception data out of the scraped Commons "date" field.

    `commons_date_string` is a JSON array of strings. Only entries containing
    'date QS' are examined. Such an entry is split on commas: the first field is
    discarded, the second is the main date (dateTime/precision), and the rest
    may hold P1319/P580 followed by a start date, P1326/P582 followed by an end
    date, and a final pair ending in Q5727902 that marks the date as circa.

    Returns (found, date, precision, bce, commons_range, start_date,
    start_precision, start_bce, end_date, end_precision, end_bce, commons_circa).
    Values that are not present keep their defaults (0, '9', False).
    """
    circa_qid = 'Q5727902'
    found = False
    commons_circa = False
    commons_range = False
    # Defaults so that something is returned even when nothing is found
    date, precision, bce = 0, '9', False
    start_date, start_precision, start_bce = 0, '9', False
    end_date, end_precision, end_bce = 0, '9', False

    for entry in json.loads(commons_date_string):
        # Skip everything except the part of the metadata holding the structured data
        if 'date QS' not in entry:
            continue
        found = True

        # Split into fields by comma and drop the initial "inception" field
        fields = entry.split(',')[1:]
        date, precision, bce = extract_from_iso_date_string(fields[0])
        fields = fields[1:]

        # A trailing property/value pair ending in the circa item flags an approximate date
        if len(fields) >= 2 and fields[-1] == circa_qid:
            commons_circa = True
            fields = fields[:-2]

        # Earliest date or start time; its value follows the P ID
        if fields and fields[0] in ('P1319', 'P580'):
            commons_range = True
            start_date, start_precision, start_bce = extract_from_iso_date_string(fields[1])
            fields = fields[2:]

        # Latest date or end time; its value follows the P ID
        if fields and fields[0] in ('P1326', 'P582'):
            commons_range = True
            end_date, end_precision, end_bce = extract_from_iso_date_string(fields[1])

    return (found, date, precision, bce, commons_range, start_date, start_precision,
            start_bce, end_date, end_precision, end_bce, commons_circa)
    
def retrieve_date_wikidata(qid, pid):
    """Return the values of property `pid` for item `qid` from the query service.

    `qid` and `pid` are inserted into the query after the wd: and wdt: prefixes.
    The POST is retried once a minute until a response arrives. Returns a list
    of value strings (empty when there are no results); if the response cannot
    be read as SPARQL JSON results, the list holds the raw response text.
    Uses the globals endpoint, sparql_request_header and sparql_sleep.
    """
    query_string = '''
select distinct ?object where {
    wd:'''+ qid + ''' wdt:''' + pid + ''' ?object.
    }'''
    #print(query_string)

    # The request is only sent inside the retry loop. The extra, unguarded POST
    # that used to precede the loop doubled the load on the endpoint and could
    # raise before any retry logic applied.
    while True:
        try:
            r = requests.post(endpoint, data=dict(query=query_string), headers=sparql_request_header)
            break
        except Exception:
            # Exception rather than a bare except so that Ctrl-C still stops the loop
            print('Query service error. Waiting 1 minute.')
            sleep(60)

    try:
        statements = r.json()['results']['bindings']
        # if no results, the list remains empty
        results_list = [statement['object']['value'] for statement in statements]
    except (ValueError, KeyError, TypeError):
        # Not the expected JSON structure: hand back the raw text for inspection
        results_list = [r.text]

    # delay by some amount to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results_list

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header, endpoint and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST `query_string` to the endpoint and return the list of result bindings.

    The request is retried once a minute until a response arrives. A response
    that is not SPARQL JSON results raises from response.json() or from the
    key lookup; that is not handled here.
    """
    while True:
        try:
            response = requests.post(endpoint, data=dict(query=query_string), headers=sparql_request_header)
            break
        except Exception:
            # Exception rather than a bare except so that Ctrl-C still stops the loop
            print('Query service error. Waiting 1 minute.')
            sleep(60)

    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
    data = response.json()

    # Extract the values from the response JSON
    results = data['results']['bindings']
    # print(json.dumps(results, indent=2))

    sleep(sparql_sleep) # delay to avoid hitting the Query Service too fast
    return results

def generate_name_alternatives(name_list):
    """Generate spelling variants (initials, dropped middle names, suffixes) for names.

    For every name in `name_list`, commas and periods are treated as spaces, a
    trailing "Jr", "II", "III", "IV", "V" or "the elder" is split off as a
    suffix, and variants are built from the remaining pieces and their initials.
    Names that contain no usable pieces are skipped rather than raising
    IndexError. Returns a list of unique variants in first-seen order.
    """
    trailing_suffixes = {'Jr': ', Jr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    alternatives = []

    # This is a hack of the previous script to allow for checking more than one name alternative
    for name in name_list:
        # treat commas and periods as if they were spaces (periods are sometimes closed up)
        cleaned = name.replace(',', ' ').replace('.', ' ')
        pieces = [piece for piece in cleaned.split(' ') if piece]
        if not pieces:
            continue # empty or punctuation-only name

        # Remove ", Jr.", "III", etc. from end of name
        suffix = ''
        if pieces[-1] in trailing_suffixes:
            suffix = trailing_suffixes[pieces[-1]]
            pieces = pieces[:-1]
        elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
            suffix = ' the elder'
            pieces = pieces[:-2]
        if not pieces:
            continue # the name consisted of a suffix only

        # generate initials for all names
        initials = []
        for piece in pieces:
            # make sure first character is alphabetic
            # only fixes the case where there is one non-alphabetic character, but more than one is rare
            # typical cases are like (Kit) or "Kit"
            if not piece[0].isalpha():
                piece = piece[1:]
            if piece:
                initials.append(piece[0])

        first = pieces[0]
        last = pieces[-1]
        middle_initials = initials[1:-1]
        leading_initials = initials[:-1]

        # full name
        full_name = ' '.join(pieces)
        alternatives.append(full_name)

        # full name with suffix
        if suffix != '':
            alternatives.append(full_name + suffix)

        # first and last name with initials
        alternatives.append(first + ' ' + ''.join(initial + ' ' for initial in middle_initials) + last)

        # first and last name with initials and periods
        alternatives.append(first + ' ' + ''.join(initial + '. ' for initial in middle_initials) + last)

        # first and last name only
        alternatives.append(first + ' ' + last)

        # The remaining variants need at least one initial
        if not initials:
            continue

        # first initial and last name only
        alternatives.append(initials[0] + ' ' + last)

        # first initial with period and last name only
        alternatives.append(initials[0] + '. ' + last)

        # all name initials with last name
        alternatives.append(initials[0] + ' ' + ''.join(initial + ' ' for initial in middle_initials) + last)

        # all name initials with periods with last name
        alternatives.append(''.join(initial + '. ' for initial in leading_initials) + last)

        # all name initials concatenated with last name
        alternatives.append(''.join(leading_initials) + ' ' + last)

    # remove duplicates; dict.fromkeys keeps first-seen order so the output is reproducible
    return list(dict.fromkeys(alternatives))

def search_name_at_wikidata(name_list):
    """Search Wikidata for items whose label or alias matches a variant of the given names.

    Variants come from generate_name_alternatives() and are tried in each of the
    language codes below. Returns a list of {'qid': ..., 'name': ...}
    dictionaries, where name is the English label; if the response cannot be
    read, returns [{'error': <raw response text>}]. The POST is retried once a
    minute until a response arrives.
    """
    # carry out search for most languages that use Latin characters, plus some other commonly used languages
    # See https://doi.org/10.1145/3233391.3233965
    # ('hu' used to appear twice, which only duplicated rows in the VALUES list)
    language_codes = [
        'en',
        'es',
        'pt',
        'fr',
        'it',
        'nl',
        'de',
        'da',
        'et',
        'hu',
        'ga',
        'ro',
        'sk',
        'sl',
        'zu',
        'tr',
        'sv',
        'zh',
        'ru',
        'ja',
        'ar',
        'pl',
        'uk',
        'ca',
        'cs',
        'la',
        'nb',
        'he',
        'eo',
        'fi',
        'ko',
    ]
    # get rid of quotes, which will break the query
    cleaned_names = [
        alternative.replace('"', '').replace("'", '')
        for alternative in generate_name_alternatives(name_list)
    ]
    alternatives = ''.join(
        '"' + alternative + '"@' + language_code + '\n'
        for language_code in language_codes
        for alternative in cleaned_names
    )
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)

    while True:
        try:
            r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)
            break
        except Exception:
            # Exception rather than a bare except so that Ctrl-C still stops the loop
            print('Query service error. Waiting 1 minute.')
            sleep(60)

    results = []
    try:
        statements = r.json()['results']['bindings']
        for statement in statements:
            wikidata_iri = statement['item']['value']
            if 'label' in statement:
                name = statement['label']['value']
            else:
                name = ''
            qnumber = extract_local_name(wikidata_iri)
            results.append({'qid': qnumber, 'name': name})
    except (ValueError, KeyError, TypeError):
        # Not the expected JSON structure: report the raw text instead of partial results
        results = [{'error': r.text}]

    # delay by sparql_sleep seconds to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def screen_qids(qids, screens):
    """Keep only the candidate Q IDs that pass at least one of the configured screens.

    `qids` is an iterable of Q ID strings. `screens` is a list of screens; each
    screen is a list of requirement dictionaries with the keys 'entity',
    'property', 'lang', 'position', 'filter_type', 'filter_string' and
    'require'. Requirements within a screen are ANDed; the screens themselves
    are ORed using UNION. Returns a list of dictionaries with the keys 'qid',
    'label' and 'description' ('' when the item has no description in the
    default language).
    """
    special_predicates = {'label': 'rdfs:label', 'description': 'schema:description'}
    qid_values = ''.join('wd:' + qid + '\n' for qid in qids)

    union_members = []
    for screen in screens:
        # Each requirement in a screen has an AND relationship (all must be satisfied)
        member_lines = []
        for requirement in screen:
            entity = requirement['entity']
            prop = requirement['property']
            # adding the property string to the variable name guarantees uniqueness
            variable = '?var' + prop

            # Set the value if required or use a dummy variable if any value is allowed
            if entity == '':
                term = variable
            elif re.match(r'Q\d+', entity): # regex to match Q IDs
                term = 'wd:' + entity
            elif requirement['lang'] == '': # otherwise assume it's a string literal
                term = '"' + entity + '"'
            else:
                term = '"' + entity + '"@' + requirement['lang']

            # Set the predicate (label, description, or P value)
            predicate = special_predicates.get(prop, 'wdt:' + prop)

            # Place the value in either the subject or object position in the triple
            if requirement['position'] == 'object':
                triple_pattern = '?qid ' + predicate + ' ' + term + '.'
            else:
                triple_pattern = term + ' ' + predicate + ' ?qid.'

            # Add filters if needed
            filter_type = requirement['filter_type']
            if filter_type in ('<', '>'):
                # note: string comparison only e.g. for datetimes, needs modification for actual numbers
                triple_pattern += '\nFILTER (STR(' + variable + ') ' + filter_type + ' "' + requirement['filter_string'] + '")'
            elif filter_type == 'in':
                # note: string comparison only
                triple_pattern += '\nFILTER (CONTAINS(' + variable + ', "' + requirement['filter_string'] + '"))'

            # Use MINUS if you want to exclude items that fit the pattern.
            if requirement['require'] == 'exclude':
                triple_pattern = 'minus {' + triple_pattern + '}'

            member_lines.append(triple_pattern + '\n')

        # Wrap each screen in braces so that several can be UNIONed
        union_members.append('{\n' + ''.join(member_lines) + '}\n')

    # UNION between the subgraph patterns creates the OR relationship
    graph_pattern = 'UNION\n'.join(union_members)

    query_string = '''
    select distinct ?qid ?label ?description where {
      VALUES ?qid
      {
      ''' + qid_values + '''}
    ''' + graph_pattern + '''
    
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''
    #print(query_string)

    return [
        {
            'qid': extract_local_name(binding['qid']['value']),
            'label': binding['label']['value'],
            'description': binding['description']['value'] if 'description' in binding else '',
        }
        for binding in send_sparql_query(query_string)
    ]

# -------------------
# Top level functions
# -------------------
    
def process_dates(act_id, output_dict, issue_log, act_url, commons_url, act_date_string, commons_date_string):
    """Fill the inception (P571) columns of `output_dict` from the ACT and Commons dates.

    ACT data is preferred when both sources have a date; disagreements between
    the two are appended to `issue_log`. When only Commons has a date, its
    values (including any earliest/latest range) are used. When neither has a
    date, every inception column is set to ''.

    Returns (output_dict, issue_log). `output_dict` is modified in place and
    `issue_log` is returned with any new lines appended.

    NOTE: the log lines include `filename`, which is not a parameter. It is read
    from the enclosing (global) scope and must be defined before a line is logged.
    """
    circa_qid = 'Q5727902'
    # Hard code ACT BCE to False, at least until it's determined whether any ACT dates
    # are designated as BCE or have negative signs.
    act_bce = False

    act_found, act_date, act_range, act_start_date, act_end_date, act_century, act_circa = process_act_date(act_date_string)
    (commons_found, commons_date, commons_precision, commons_bce, commons_range,
     commons_start_date, commons_start_precision, commons_start_bce,
     commons_end_date, commons_end_precision, commons_end_bce,
     commons_circa) = process_commons_date(commons_date_string)

    # Everything defaults to empty; the branches below fill in what is known
    reference_url = retrieved = inception = circumstances = precision = ''
    earliest_val = earliest_prec = latest_val = latest_prec = ''

    if act_found:
        # ACT data wins whenever it exists
        act_precision = '7' if act_century else '9'
        reference_url = act_url
        retrieved = today
        inception = generate_date_string(act_date, act_bce)
        if act_circa:
            circumstances = circa_qid
        precision = act_precision
        if act_range:
            earliest_val = generate_date_string(act_start_date, act_bce)
            earliest_prec = act_precision
            latest_val = generate_date_string(act_end_date, act_bce)
            latest_prec = act_precision

        if commons_found:
            # Both sources have dates: flag any disagreement as a potential error
            if act_bce != commons_bce:
                issue_log += act_id + ' | ' + filename + ' | Disagreement between ACT and Commons on CE/BCE.\n'
            if act_date != commons_date:
                issue_log += act_id + ' | ' + filename + ' | ACT inception: ' + str(act_date) + ', Commons inception: ' + str(commons_date) + '\n'
            if act_circa != commons_circa:
                issue_log += act_id + ' | ' + filename + ' | Disagreement between ACT and Commons on circa.\n'
            if act_precision != commons_precision:
                issue_log += act_id + ' | ' + filename + ' | ACT precision: ' + act_precision + ', Commons precision: ' + commons_precision + '\n'
            # Range check is skipped when Commons has no range at all
            if act_range and not (commons_start_date == 0 and commons_end_date == 0):
                if not (act_start_date == commons_start_date and act_end_date == commons_end_date):
                    issue_log += act_id + ' | ' + filename + ' | ACT date range: ' + str(act_start_date) + '-' + str(act_end_date) + ', Commons date range: ' + str(commons_start_date) + '-' + str(commons_end_date) + '\n'

    elif commons_found:
        reference_url = commons_url
        retrieved = today
        inception = generate_date_string(commons_date, commons_bce)
        if commons_circa:
            circumstances = circa_qid
        precision = commons_precision
        # This branch used to test act_range, which is always False here, so
        # Commons ranges were never written. The end date also took ACT's BCE
        # flag instead of its own.
        if commons_range:
            # 0 is process_commons_date's "not present" default, so a one-sided
            # range leaves the missing side empty rather than writing year 0000
            if commons_start_date != 0:
                earliest_val = generate_date_string(commons_start_date, commons_start_bce)
                earliest_prec = commons_start_precision
            if commons_end_date != 0:
                latest_val = generate_date_string(commons_end_date, commons_end_bce)
                latest_prec = commons_end_precision

    else:
        issue_log += act_id + ' | ' + filename + ' | no dates retrieved. Commons data: ' + commons_date_string + '\n'

    output_dict['inception_ref1_referenceUrl'] = reference_url
    output_dict['inception_ref1_retrieved_val'] = retrieved
    output_dict['inception_val'] = inception
    output_dict['inception_sourcing_circumstances'] = circumstances
    output_dict['inception_prec'] = precision
    output_dict['inception_earliest_date_val'] = earliest_val
    output_dict['inception_earliest_date_prec'] = earliest_prec
    output_dict['inception_latest_date_val'] = latest_val
    output_dict['inception_latest_date_prec'] = latest_prec

    # Check dates for reasonableness (these are string comparisons of the dateTime text)
    if output_dict['inception_val'] > today:
        issue_log += act_id + ' | ' + filename + ' | Inception date occurs in the future. Date: ' + output_dict['inception_val'] + '\n'
    # Only compare the ends of a range when both are present
    if earliest_val and latest_val and earliest_val > latest_val:
        issue_log += act_id + ' | ' + filename + ' | Final date in range before initial date: ' + output_dict['inception_earliest_date_val'] + ', ' + output_dict['inception_latest_date_val'] + '\n'

    return output_dict, issue_log

def process_labels(filename, commons_data_type, commons_data):
    """Split the scraped Commons title of a file into one title string per language.

    The title is read from `commons_data` (a DataFrame with a `filename` column)
    in the row matching `filename`: column 'title' when `commons_data_type` is
    'artwork', column 'description' for any other type except 'none', which
    skips the lookup. The cell holds a JSON array of text blobs. A blob that
    contains one of the language names below switches the current language; the
    other blobs are concatenated into the title of the current language, which
    starts out as English.

    Returns a list of {'lang': code, 'title': text} dictionaries. Raises
    IndexError when `filename` is not present in `commons_data`.
    """
    languages = [
        {'string':'English', 'code': 'en'},
        {'string':'Русский', 'code': 'ru'},
        {'string':'Deutsch', 'code': 'de'},
        {'string':'Français', 'code': 'fr'},
        {'string':'Ελληνικά', 'code': 'el'},
        {'string':'Norsk bokmål', 'code': 'nb'},
        {'string':'Português', 'code': 'pt'},
        {'string':'Italiano', 'code': 'it'},
        {'string':'Español', 'code': 'es'},
        {'string':'עברית', 'code': 'he'},
        {'string':'Català', 'code': 'ca'},
        {'string':'Slovenščina', 'code': 'sl'},
        {'string':'Hrvatski', 'code': 'hr'},
        {'string':'Svenska', 'code': 'sv'},
        {'string':'Српски / srpski', 'code': 'sr'},
        {'string': '中文', 'code': 'zh-Hans'}
         ]

    # Look up the title in the Commons data. For information template, it's usually description; for artwork: title
    commons_title_list = []
    if commons_data_type != 'none':
        title_field = 'title' if commons_data_type == 'artwork' else 'description'
        commons_title_list = json.loads(commons_data.loc[commons_data.filename == filename, title_field].values[0])

    work_language_strings = []
    build_string = ''
    language_code = 'en' # assume starting with English if no explicit tag
    # Examine all of the blobs of the Commons language list
    for blob in commons_title_list:
        found_code = next((language['code'] for language in languages if language['string'] in blob), None)
        if found_code is not None:
            # close off previous language string and start a new one
            work_language_strings.append({'lang': language_code, 'title': build_string.strip()})
            language_code = found_code
            build_string = ''
        elif language_code == 'zh-Hans' or blob.startswith(','):
            # don't put spaces between Chinese characters or before a comma.
            # startswith() also copes with an empty blob, where blob[0] raised IndexError.
            build_string += blob
        else:
            build_string += ' ' + blob # otherwise, put spaces between the concatenated blobs

    # When through the entire list, finish off the last string
    work_language_strings.append({'lang': language_code, 'title': build_string.strip()})
    # If the very first blob was a language tag and nothing was added to the initial string, then delete it
    if work_language_strings[0]['title'] == '':
        work_language_strings = work_language_strings[1:]

    return work_language_strings

def process_type(act_id, act_type_string, commons_type_string, issue_log):
    """Determine the type of a work (Wikidata "instance of") from ACT and Commons data.

    The ACT type string is checked first. Only if it contains none of the known
    type keywords are the Commons "object type" blobs examined. In both sources
    the keywords are tried in the order listed in ``types`` and the first keyword
    that matches wins.

    Parameters
    ----------
    act_id : str
        ACT record identifier, used only to label issue log entries.
    act_type_string : str
        Free-text type description from ACT; matched case-insensitively.
    commons_type_string : str
        JSON-encoded list of strings from the Commons data ('[]' if none).
    issue_log : str
        Issue log accumulated so far.

    Returns
    -------
    tuple
        (type_string, qid, issue_log, data_source). type_string and qid are
        empty strings if no type was recognized, in which case a line is added
        to the issue log. data_source is 'act', 'commons', or ''.
    """
    types = [
        {'string': 'drawing', 'qid': 'Q93184'},
        {'string': 'painting', 'qid': 'Q3305213'},
        {'string': 'manuscript', 'qid': 'Q87167'},
        {'string': 'sculpture', 'qid': 'Q860861'},
        {'string': 'carving', 'qid': 'Q97570030'},
        {'string': 'textile', 'qid': 'Q28823'},
        {'string': 'garment', 'qid': 'Q11460'},
        {'string': 'photograph', 'qid': 'Q125191'},
        {'string': 'architecture', 'qid': 'Q811979'},
        {'string': 'mosaic', 'qid': 'Q133067'},
        {'string': 'fresco', 'qid': 'Q22669139'},
        {'string': 'print', 'qid': 'Q11060274'},
        {'string': 'mural', 'qid': 'Q219423'},
        {'string': 'watercolor', 'qid': 'Q3305213'}  # same item as 'painting'
    ]

    commons_type_list = json.loads(commons_type_string)
    qid = ''
    type_string = ''
    data_source = ''

    # Look for type in ACT data
    act_type_lower = act_type_string.lower()
    for kind in types:
        if kind['string'] in act_type_lower:
            type_string = kind['string']
            qid = kind['qid']
            data_source = 'act'
            break

    # Fall back on the Commons data only if ACT did not provide a type
    if data_source == '':
        commons_blobs = [blob.lower() for blob in commons_type_list]
        for kind in types:
            # Stop at the first matching keyword. Previously only the inner loop
            # over blobs was exited, so a later keyword could overwrite the match.
            if any(kind['string'] in blob for blob in commons_blobs):
                type_string = kind['string']
                qid = kind['qid']
                data_source = 'commons'
                break

    if data_source == '':
        issue_log += act_id + ' | Could not identify type (instance of)\n'

    # Use a more natural phrase for the human-readable type string
    if type_string == 'architecture':
        type_string = 'architectural structure'

    return type_string, qid, issue_log, data_source

# *** TODO: need to output information that was discovered that might be used to create artist items
# Also, make use of any information found about photographers of 3D objects
def process_artists(act_id, act_artist_string, filename, commons_data_type, commons_data, issue_log):
    """Reconcile the artist given by ACT and Commons and try to match it to a Wikidata item.

    The ACT artist string is split on commas into family name, given name, and an
    optional dates string. The Commons creator list is examined for a name, an
    "anonymous"/"unknown" assertion, or a photographer credit. If at least one
    name is available, Wikidata is searched and a candidate is accepted only when
    its birth or death year agrees with the year parsed from the ACT dates.

    Relies on names defined elsewhere in this file: ``search_name_at_wikidata``,
    ``screen_qids``, ``retrieve_date_wikidata`` and the module-level ``screens``.

    Parameters
    ----------
    act_id : str
        ACT record identifier, used only to label issue log entries.
    act_artist_string : str
        Artist string from ACT (may be empty).
    filename : str
        Commons file name used to select the row in ``commons_data``.
    commons_data_type : str
        'artwork', 'none', or another template type (treated as information template).
    commons_data : pandas.DataFrame
        Commons data; not accessed when ``commons_data_type`` is 'none'.
    issue_log : str
        Issue log accumulated so far.

    Returns
    -------
    tuple
        (artist_qid, name_string, source, anonymous, issue_log, death_date, unmatched)
        artist_qid is '' when no Wikidata match was confirmed by dates.
        source is 'act', 'commons', or ''. unmatched is {} or a dict with the
        keys 'search_names' and 'possible_matches'.
    """
    # Look up the artist in the Commons data. For information template, it's author; for artwork: artist
    commons_artist_list = []
    if commons_data_type != 'none':
        if commons_data_type == 'artwork':
            creator_field = 'artist'
        else:
            creator_field = 'author'
        commons_artist_list = json.loads(commons_data.loc[commons_data.filename == filename, creator_field].values[0])

    # Dissect the ACT artist string
    # Note: currently not really making use of given_name and family_name
    unidentified = False
    approximate = False  # recorded but not yet used
    given_name = ''
    family_name = ''
    name = ''
    dates_string = ''
    birth_date = ''
    death_date = ''
    if act_artist_string != '':
        pieces = act_artist_string.split(',')
        if len(pieces) == 1:
            name = act_artist_string
        else:
            # ACT lists the family name first, then the given name, then (optionally) dates
            given_name = pieces[1].strip()
            family_name = pieces[0].strip()
            name = given_name + ' ' + family_name
            if len(pieces) >= 3:
                dates_string = pieces[2].strip()
            if len(pieces) > 3:
                issue_log += act_id + ' | Check ACT artist name (4 or more parts)\n'

        if 'unidentified' in name.lower():
            unidentified = True
            name = ''

        # Process artist dates
        if dates_string != '':
            if 'approximately' in dates_string:
                dates_string = dates_string.split('approximately')[1].strip()
                approximate = True

            if '-' in dates_string:
                # Strip so that "1475 - 1564" can still match the Wikidata years
                pieces = dates_string.split('-')
                birth_date = pieces[0].strip()
                death_date = pieces[1].strip()
            else:
                # Single dates are expected to look like "d. 1564" or "b. 1475".
                # If there is no second space-separated part, both dates stay empty.
                try:
                    if 'd' in dates_string:
                        death_date = dates_string.split(' ')[1]
                    elif 'b' in dates_string:
                        birth_date = dates_string.split(' ')[1]
                except IndexError:
                    pass

    # Process Commons creator data
    unknown = False
    commons_name = ''
    commons_photographer = ''  # captured for future use (see TODO above the function)
    if len(commons_artist_list) > 0:
        first_blob = commons_artist_list[0].strip().lower()
        if 'anonymous' in first_blob or 'unknown' in first_blob:
            unknown = True
        elif len(commons_artist_list) > 1 and 'photograph' in first_blob:
            # First blob is a photographer label; the following blob is taken as the photographer
            commons_photographer = commons_artist_list[1].strip()
        else:
            commons_name = commons_artist_list[0].strip()
        # NOTE: Commons names containing a comma are not currently reversed.

    # Reconcile names, attempt to match with Wikidata item, and prepare data to return from function
    anonymous = False
    source = ''
    name_string = ''

    if name != '' and commons_name != '': # case where both names are known
        source = 'act'
        name_string = name
        set_ratio = fuzz.token_set_ratio(name, commons_name)

        if set_ratio < 95:
            issue_log += act_id + ' | Low similarity ratio: ' + str(set_ratio) + ' between ACT name: ' + name + ' and Commons name: ' + commons_name + '\n'
            search_names = [name] # if disagreement, go with the ACT name
        else:
            # If the names are similar, use both in search
            search_names = [name, commons_name]
    elif name == '' and commons_name == '': # case where neither name is known
        search_names = []
        if unknown or unidentified: # Only set as anonymous if asserted by either ACT or Commons
            anonymous = True
            name_string = 'artist unknown'
            if unknown:
                source = 'commons'
            else:
                source = 'act'
    elif commons_name == '':
        search_names = [name]
        name_string = name
        source = 'act'
        if unknown: # Commons says doesn't know name but there is an ACT name
            issue_log += act_id + ' | Commons says name unknown, but ACT gives name: ' + name + '\n'
    else:
        search_names = [commons_name]
        name_string = commons_name
        source = 'commons'
        if unidentified: # ACT says doesn't know name but there is a Commons name
            issue_log += act_id + ' | ACT says author unidentified, but Commons gives name: ' + commons_name + '\n'

    artist_qid = ''
    unmatched = {}
    if search_names:
        # Search Wikidata for variants of the name
        hits = search_name_at_wikidata(search_names) # Searching a list of names is a hack of the original function

        # Perform screens to eliminate items that aren't human, etc.
        qids = [hit['qid'] for hit in hits]
        return_list = screen_qids(qids, screens)

        # Try to match birth or death dates with what's in Wikidata for the candidate matches
        matched = False
        for artist in return_list:
            # Only the first four characters (presumably the year) of the first value are compared
            birth_date_list = retrieve_date_wikidata(artist['qid'], 'P569')
            if len(birth_date_list) >= 1:
                wd_birth_date = birth_date_list[0][0:4]
            else:
                wd_birth_date = ''

            death_date_list = retrieve_date_wikidata(artist['qid'], 'P570')
            if len(death_date_list) >= 1:
                wd_death_date = death_date_list[0][0:4]
            else:
                wd_death_date = ''

            if birth_date != '' and birth_date == wd_birth_date:
                matched = True
            if death_date != '' and death_date == wd_death_date:
                matched = True

            if matched:
                artist_qid = artist['qid']
                break

        # Keep the candidates for later human review when none could be confirmed by dates
        if len(return_list) > 0 and not matched:
            print('No match with possible names:', return_list)
            unmatched = {
                'search_names': search_names,
                'possible_matches': return_list
            }

    return artist_qid, name_string, source, anonymous, issue_log, death_date, unmatched


## Extract/clean inception date

The dates given in ACT and Commons can have several characteristics:

- beginning and ending ranges
- circa. Designated very consistently as "ca." in ACT. 
- century designation (essentially setting the precision to the century level)

The Commons data also sometimes is structured using Wikidata date Q IDs, qualifiers, and standard xsd:dateTime format:
- Starts with `QS:P571` (inception)
- Sometimes has `P1319` (earliest date) and `P1326` (latest date)
- Sometimes has `P580` (start time) and `P582` (end time)
- Sometimes has qualifier `P1480` (sourcing circumstances) with value `Q5727902` (circa)
- Date precisions can be 7 (century), 8 (decade), or 9 (year)


In [None]:
# Accumulators that persist across all works processed in this cell
entire_issue_log = ''  # text of all issues found; printed and saved to a file at the end
unidentified_artists = []  # candidate Wikidata matches for artists that could not be confirmed

# If there are any existing data in VanderBot format, load it (or just the file headers)
# NOTE(review): read_dicts_from_csv is defined elsewhere in this file; new rows are appended to its result.
output_list = read_dicts_from_csv('abstract_artworks.csv')
# Column headers for the output CSV, in the order they are to be written.
# Every entry must be its own string literal followed by a comma: a comma placed
# inside the quotes makes Python silently concatenate two adjacent literals into
# a single bogus column name.
fieldnames = [
    'commons_template',
    'label_en',
    'label_commons',
    'description_en',
    'act_uuid',
    'act',
    'act_ref1_hash',
    'act_ref1_retrieved_nodeId',
    'act_ref1_retrieved_val',
    'act_ref1_retrieved_prec',
    'inventory_number_uuid',
    'inventory_number',
    'inventory_number_collection',
    'inventory_number_ref1_hash',
    'inventory_number_ref1_referenceUrl',
    'inventory_number_ref1_retrieved_nodeId',
    'inventory_number_ref1_retrieved_val',
    'inventory_number_ref1_retrieved_prec',
    'title_uuid',
    'title',
    'title_ref1_hash',
    'title_ref1_referenceUrl',
    'title_ref1_retrieved_nodeId',
    'title_ref1_retrieved_val',
    'title_ref1_retrieved_prec',
    'creator_uuid',
    'creator',
    'creator_object_has_role',
    'creator_ref1_hash',
    'creator_ref1_referenceUrl',
    'creator_ref1_retrieved_nodeId',
    'creator_ref1_retrieved_val',
    'creator_ref1_retrieved_prec',
    'instance_of_uuid',
    'instance_of',
    'instance_of_ref1_hash',
    'instance_of_ref1_referenceUrl',
    'instance_of_ref1_retrieved_nodeId',
    'instance_of_ref1_retrieved_val',
    'instance_of_ref1_retrieved_prec',
    'inception_uuid',
    'inception_nodeId',
    'inception_val',
    'inception_prec',
    'inception_earliest_date_nodeId',
    'inception_earliest_date_val',
    'inception_earliest_date_prec',
    'inception_latest_date_nodeId',
    'inception_latest_date_val',
    'inception_latest_date_prec',
    'inception_sourcing_circumstances',
    'inception_ref1_hash',
    'inception_ref1_referenceUrl',
    'inception_ref1_retrieved_nodeId',
    'inception_ref1_retrieved_val',
    'inception_ref1_retrieved_prec',
    'country_of_origin_uuid',
    'country_of_origin',
    'country_of_origin_ref1_hash',
    'country_of_origin_ref1_referenceUrl',
    'country_of_origin_ref1_retrieved_nodeId',
    'country_of_origin_ref1_retrieved_val',
    'country_of_origin_ref1_retrieved_prec',
    'copyright_status_uuid',
    'copyright_status',
    'copyright_status_applies_to_jurisdiction',
    'copyright_status_determination_method',
    'copyright_status_ref1_hash',
    'copyright_status_ref1_referenceUrl',
    'copyright_status_ref1_retrieved_nodeId',
    'copyright_status_ref1_retrieved_val',
    'copyright_status_ref1_retrieved_prec',
    'image_uuid',
    'image',
    'image_ref1_hash',
    'image_ref1_referenceUrl',
    'image_ref1_retrieved_nodeId',
    'image_ref1_retrieved_val',
    'image_ref1_retrieved_prec',
    'collection_uuid',
    'collection',
    'collection_ref1_hash',
    'collection_ref1_referenceUrl',
    'collection_ref1_retrieved_nodeId',
    'collection_ref1_retrieved_val',
    'collection_ref1_retrieved_prec',
]

# Multilingual title data extracted from Commons, one entry per work; saved as JSON at the end
labels_data = []

# Build one output row (output_dict) per ACT record, combining ACT and Commons data.
# NOTE(review): `today` is not defined in this cell; presumably set earlier via generate_utc_date() - confirm.
for index, work in act_data.iterrows():
    issue_log = ''
    output_dict = {}
    act_id = work['RecordNumber']
    act_url = 'http://diglib.library.vanderbilt.edu/act-imagelink.pl?RC=' + act_id
    
    # Look up the Commons page URL
    # (.values[0] raises IndexError if the ACT ID has no row in the IDs file)
    commons_url = ids.loc[ids.RecordNumber == act_id, 'commons_page_url'].values[0]
    
    # Look up the Commons filename in the IDs file using the ACT ID
    # This seems inefficient to have to look up the row twice, but I suppose Pandas is
    # efficient enough with a small dataset like this that it doesn't matter.
    filename = ids.loc[ids.RecordNumber == act_id, 'filename'].values[0]

    # Determine whether the information or artwork template was used, or if Commons data not available
    # Anything other than exactly one matching row (none or duplicates) is treated as 'none'
    commons_date_type_series = commons_data.loc[commons_data.filename == filename, 'template_type']    
    if len(commons_date_type_series) == 1:
        commons_data_type = commons_date_type_series.values[0]
    else:
        commons_data_type = 'none'
    output_dict['commons_template'] = commons_data_type

    print('commons data type:', commons_data_type)
    
    # -----------------------------------
    # *** Process title information to generate English label ***
    
    act_title = work['Title'].strip()
    
    print(act_title)
    
    # Extract information about titles/labels from Commons data
    work_language_strings = process_labels(filename, commons_data_type, commons_data)
    
    # Attach the extracted language data to the list for later saving
    labels_data.append({'act_id': act_id, 'act_title': act_title, 'commons_data_type': commons_data_type, 'commons_language_strings': work_language_strings})
    
    # Add discovered label data to the output
    output_dict['label_en'] = act_title # Use the ACT title as the default
    
    # If several English strings were found, the last one is kept
    commons_label = ''
    for work_language in work_language_strings:
        if work_language['lang'] == 'en':
            commons_label = work_language['title'].strip()
    output_dict['label_commons'] = commons_label
    
    # -----------------------------------
    # *** Other fields ***
    
    # Create ACT ID output
    output_dict['act'] = act_id
    output_dict['act_ref1_retrieved_val'] = today
    
    # Create Commons image output
    # VanderBot will convert the raw, unencoded file name into the appropriate IRI for Wikidata
    output_dict['image'] = filename
    output_dict['image_ref1_referenceUrl'] = act_url
    output_dict['image_ref1_retrieved_val'] = today
    
    # -----------------------------------
    # *** Process inception dates ***
    
    # Get the ACT date string value
    act_date_string = work['DateCreation'].strip()

    # Look up the date value in the Commons data
    if commons_data_type != 'none':
        commons_date_string = commons_data.loc[commons_data.filename == filename, 'date'].values[0]
    else:
        commons_date_string = '[]' # case where no commons data exists for this work

    #print(act_date_string, commons_date_string)
    
    # process_dates (defined elsewhere) returns the updated output row and issue log
    output_dict, issue_log = process_dates(act_id, output_dict, issue_log, act_url, commons_url, act_date_string, commons_date_string)
    
    # -----------------------------------
    # *** Determine type (instance of) ***
    
    act_type_string = work['ObjectFunction'].strip()
    
    # Look up the object type value in the Commons data
    if commons_data_type != 'none':
        commons_type_string = commons_data.loc[commons_data.filename == filename, 'object type'].values[0]
    else:
        commons_type_string = '[]' # case where no commons data exists for this work
    type_string, instance_of_qid, issue_log, data_source = process_type(act_id, act_type_string, commons_type_string, issue_log)
    #print(type_string, instance_of_qid)
    
    # The reference URL points at whichever source supplied the type; blank if neither did
    output_dict['instance_of'] = instance_of_qid
    if data_source == 'act':
        output_dict['instance_of_ref1_referenceUrl'] = act_url
    elif data_source == 'commons':
        output_dict['instance_of_ref1_referenceUrl'] = commons_url
    else:
        output_dict['instance_of_ref1_referenceUrl'] = ''
    if data_source != '':
        output_dict['instance_of_ref1_retrieved_val'] = today
    else:
        output_dict['instance_of_ref1_retrieved_val'] = ''

    print('work type:', type_string)

    # -----------------------------------
    # *** Determine artist ***
    
    # Note: data_source is reused here and now refers to the source of the artist information
    act_artist_string = work['CreatorArtist'].strip()
    artist_qid, name_string, data_source, anonymous, issue_log, death_date, unmatched = process_artists(act_id, act_artist_string, filename, commons_data_type, commons_data, issue_log)
    
    output_dict['creator'] = artist_qid
    if anonymous:
        # NOTE(review): 'anon' is presumably a placeholder understood by VanderBot - confirm
        output_dict['creator'] = 'anon'
    # Reference columns are only filled in when there is a creator value to reference
    if output_dict['creator'] != '':
        if data_source == 'act':
            output_dict['creator_ref1_referenceUrl'] = act_url
        elif data_source == 'commons':
            output_dict['creator_ref1_referenceUrl'] = commons_url
        else:
            output_dict['creator_ref1_referenceUrl'] = ''
        if data_source != '':
            output_dict['creator_ref1_retrieved_val'] = today
        else:
            output_dict['creator_ref1_retrieved_val'] = ''
            
    # If the artist could not be matched and there were possible matches, add them to the list
    if unmatched != {}:
        unidentified_artists.append(unmatched)
        
    # -----------------------------------
    # *** Generate description string ***
    
    # Form is "<type> by <name>"; either part is omitted if unknown
    description = ''
    if type_string != '':
        description += type_string
    if name_string != '':
        description += ' by ' + name_string
    output_dict['description_en'] = description
    
    # -----------------------------------
    # *** Find out if the work is Public Domain ***
    
    act_copyright_string = work['CopyrightStatus'].strip()
    
    # Look up the permission value in the Commons data
    if commons_data_type != 'none':
        commons_permission_string = commons_data.loc[commons_data.filename == filename, 'permission'].values[0]
    else:
        commons_permission_string = '[]' # case where no commons data exists for this work
     
    # Start off with no values and overwrite as discovered
    output_dict['copyright_status'] = ''
    output_dict['copyright_status_applies_to_jurisdiction'] = ''
    output_dict['copyright_status_determination_method'] = ''
    output_dict['copyright_status_ref1_referenceUrl'] = ''
    output_dict['copyright_status_ref1_retrieved_val'] = ''

    # Determine years since death of artist, if known
    if death_date != '':
        #print(today, death_date)
        # A death date that is not an integer year is treated as zero years since death
        try:
            years_since_death = int(today[:4]) - int(death_date)
        except:
            years_since_death = 0
        if years_since_death > 100:
            output_dict['copyright_status'] = 'Q19652' # Public Domain
            output_dict['copyright_status_applies_to_jurisdiction'] = 'Q60332278' # countries with 100 years pma or shorter
            output_dict['copyright_status_determination_method'] = 'Q29940705' # 100 years or more after author's death
        
    # An explicit "public domain" statement in either source is recorded with a reference; ACT takes precedence
    if 'public domain' in act_copyright_string.lower() or 'public domain' in commons_permission_string.lower():
        if 'public domain' in act_copyright_string.lower():
            copyright_ref = act_url
        else:
            copyright_ref = commons_url
        # Determination method will be left blank since we don't know how the sources decided this
        # (a method already set from the death date above is not cleared here)
        output_dict['copyright_status'] = 'Q19652' # OK to write over value if already determined from dates
        output_dict['copyright_status_ref1_referenceUrl'] = copyright_ref
        output_dict['copyright_status_ref1_retrieved_val'] = today
        
    # -----------------------------------
    # *** Determine country of origin ***
    
    # Default to no values
    output_dict['country_of_origin'] = ''
    output_dict['country_of_origin_ref1_referenceUrl'] = ''
    output_dict['country_of_origin_ref1_retrieved_val'] = ''

    country_string = work['LocationCountry'].strip()
    if country_string != '' and work['OriginalLocation'] == '': # skip the country when an original location is given (rare)
        country_qid_series = country_mappings.loc[country_mappings.string == country_string, 'qid']
        if len(country_qid_series) == 1: # only accept an unambiguous (exactly one) match
            output_dict['country_of_origin'] = country_qid_series.values[0]
            output_dict['country_of_origin_ref1_referenceUrl'] = act_url
            output_dict['country_of_origin_ref1_retrieved_val'] = today
    
    # -----------------------------------
    # Look up the collection from Commons data if it exists
    
    if commons_data_type != 'none':
        commons_collection_string = commons_data.loc[commons_data.filename == filename, 'collection'].values[0]
    else:
        commons_collection_string = '[]' # case where no commons data exists for this work
    commons_collection_list = json.loads(commons_collection_string)
    #print(commons_collection_list)

    # Default values to empty string
    output_dict['collection'] = ''
    output_dict['collection_ref1_referenceUrl'] = ''
    output_dict['collection_ref1_retrieved_val'] = ''
    collection_qid = ''
    
    if len(commons_collection_list) > 0:
        if ':' in commons_collection_list[0]: # for cases where the language is the first item, e.g. "Deutsch:"
            try: # need to error trap because in rare cases there is only a collection name and it includes a colon
                collection_name = commons_collection_list[1]
            except:
                collection_name = commons_collection_list[0]
        else: # otherwise, the name is listed first
            collection_name = commons_collection_list[0]

        # Look up the collection Q ID
        collection_series = collections_mappings.loc[collections_mappings.name == collection_name, 'qid']
        if len(collection_series) == 1: # a single match is found
            collection_qid = collection_series.values[0]
            output_dict['collection'] = collection_qid
            output_dict['collection_ref1_referenceUrl'] = commons_url
            output_dict['collection_ref1_retrieved_val'] = today
        else:
            pass # No match is found (or perhaps multiple, but avoid that in the collections.csv file)
    
    # -----------------------------------
    # Look up the accession number from Commons data if it exists

    if commons_data_type != 'none':
        commons_accession_string = commons_data.loc[commons_data.filename == filename, 'accession number'].values[0]
    else:
        commons_accession_string = '[]' # case where no commons data exists for this work
    commons_accession_list = json.loads(commons_accession_string)

    # Default values to empty string
    output_dict['inventory_number'] = ''
    output_dict['inventory_number_collection'] = ''
    output_dict['inventory_number_ref1_referenceUrl'] = ''
    output_dict['inventory_number_ref1_retrieved_val'] = ''
    
    # Only supply the inventory number if it can be associated with a collection
    if collection_qid != '':
        # If there is a clean accession number, it's generally the first item on the list
        if len(commons_accession_list) > 0:
            output_dict['inventory_number'] = commons_accession_list[0]
            output_dict['inventory_number_collection'] = collection_qid
            output_dict['inventory_number_ref1_referenceUrl'] = commons_url
            output_dict['inventory_number_ref1_retrieved_val'] = today
    
    # -----------------------------------
    if issue_log != '': # include extra blank line only if some issues were added for this work
        issue_log += '\n'
    entire_issue_log += issue_log
    print()
    output_list.append(output_dict)
    
#print(json.dumps(labels_data, indent =2, ensure_ascii=False))

# Write all rows (previously loaded plus newly generated) using the column order in fieldnames
write_dicts_to_csv(output_list, output_directory + 'abstract_artworks_out.csv', fieldnames)

# Save the data for multilingual labels for future use
with open(output_directory + 'language_labels.json', 'wt', encoding='utf8') as file_object:
    json.dump(labels_data, file_object, indent = 2, ensure_ascii=False)
    
# Save the issue log in a file
with open(output_directory + 'issue_log.txt', 'wt', encoding='utf-8') as file_object:
    file_object.write(entire_issue_log)
    
# Save the potential artist matches in a file
# (unlike the labels file, non-ASCII characters are written as \u escapes here)
with open(output_directory + 'unidentified_artists.json', 'wt', encoding='utf-8') as file_object:
    file_object.write(json.dumps(unidentified_artists, indent=2))

print('done')


In [None]:
# Display the issue log accumulated by the previous cell for review in the notebook
print(entire_issue_log)