# Script to create Wikidata items for ACT artwork

Initially, this will be for works that are already in Commons, but for which the links to Commons are from non-artwork Wikidata items. Eventually, we can modify this for any work that's in Commons and doesn't have a Wikidata item.

## Properties whose values need to be cleaned/generated

Taken from [here](https://github.com/HeardLibrary/vandycite/blob/master/act/processed_lists/candidate_properties_to_write.csv).

P571 (inception): get from ACT "DateCreation" and Commons "date" fields

## Configuration section

Run once at the start


In [None]:
# Import modules
import json
import csv
import math
import datetime
# Pandas for data frame management
import pandas as pd

# Read the three input tables: the ACT database dump, the Commons Mediawiki table
# scrape, and the ID lookup table. Every column is read as a string, and
# na_filter=False keeps empty cells as '' instead of converting them to NaN.
_csv_read_options = {'na_filter': False, 'dtype': str}
act_data = pd.read_csv('act_data_fix.csv', **_csv_read_options)
commons_data = pd.read_csv('commons_data_fix.csv', **_csv_read_options)
ids = pd.read_csv('clean_ids.csv', **_csv_read_options)

# For testing purposes, just use the first few rows
#act_data = act_data.head(100).copy()
#commons_data = commons_data.head(100).copy()

# --------------------
# Low-level functions
# --------------------

def generate_utc_date():
    """Return the current UTC calendar date as an ISO 8601 string (form: 2019-12-05)."""
    # datetime.utcnow() is deprecated as of Python 3.12; asking for an aware
    # "now" in UTC yields the same calendar date without the deprecation.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Date stamp used for every "retrieved" reference value written by this script
today = generate_utc_date()

# Load a CSV file from disk as a table: a list holding one dictionary per data row
def read_dicts_from_csv(filename):
    """Return the rows of the UTF-8 CSV file `filename` as a list of dictionaries.

    The first line of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# Save a table (a list of dictionaries) to disk as a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write `table` to the UTF-8 CSV file `filename`.

    `fieldnames` gives the column order; it is written as the header row and
    each dictionary in `table` becomes one data row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def pad_zeros_left(date_string):
    """Left-pad `date_string` with '0' characters to a minimum width of four.

    Strings that are already four or more characters long come back unchanged.
    """
    return date_string.rjust(4, '0')

def generate_date_string(date, bce):
    """Build a dateTime string for 1 January of the year `date`.

    The year is zero-padded to four digits and prefixed with '-' when `bce`
    is true, e.g. generate_date_string(450, False) -> '0450-01-01T00:00:00Z'.
    """
    sign = '-' if bce else ''
    return sign + pad_zeros_left(str(date)) + '-01-01T00:00:00Z'

# ----------------------------
# Intermediate-level functions
# ----------------------------

# Parse the ACT date string into structured components
def process_act_date(act_date):
    """Parse an ACT "DateCreation" string.

    Recognized forms (each optionally preceded by "ca."):
      - a single year:        "1503"
      - a year range:         "1500-1510"
      - a single century:     "5th century"
      - a range of centuries: "12th-13th centuries"

    Returns a 7-tuple:
        (found, date, act_range, start_date, end_date, act_century, act_circa)
    where `date` is the year (the midpoint for ranges, mid-century for
    centuries), `start_date`/`end_date` are non-zero only for ranges, and the
    three flags describe the form that was recognized.

    An empty string, or a string that cannot be parsed, yields
    (False, 0, False, 0, 0, False, False) so that callers never treat the
    placeholder year 0 as a real date. Parsing failures are still reported
    with a printed message.
    """
    not_found = (False, 0, False, 0, 0, False, False)

    # If there is no date from ACT, there is nothing to parse
    if act_date == '':
        return not_found

    # Determine circa status of ACT date
    act_circa = 'ca.' in act_date
    if act_circa:
        # Keep what follows the "ca." and clean whitespace
        act_date = act_date.split('ca.')[1].strip()

    # Simplest case: the whole string is a year number
    try:
        date = int(act_date)
    except ValueError:
        pass
    else:
        return True, date, False, 0, 0, False, act_circa

    # Single century, e.g. "5th century"
    if 'century' in act_date:
        # Remove " century" plus the two-letter ordinal suffix ("th", "rd", "st", ...)
        century_text = act_date[:-10]
        try:
            date = int(century_text) * 100 - 50 # set the date at mid-century
        except ValueError:
            print('numeric conversion error on', century_text)
            return not_found
        return True, date, False, 0, 0, True, act_circa

    # Everything else is treated as a range: either centuries or plain years.
    # Both end points are parsed before anything is stored, so a half-parsed
    # range can never leak a bogus start date or midpoint to the caller.
    act_century = 'centuries' in act_date
    try:
        if act_century:
            # Remove " centuries", then strip the ordinal suffix from each piece
            pieces = act_date[:-10].strip().split('-')
            start_date = int(pieces[0][:-2]) * 100 - 50 # set the date at mid-century
            end_date = int(pieces[1][:-2]) * 100 - 50 # set the date at mid-century
        else:
            pieces = act_date.split('-')
            start_date = int(pieces[0])
            end_date = int(pieces[1])
    except (ValueError, IndexError):
        if act_century:
            print('error in processing century range')
        else:
            print('error in processing date range')
        return not_found

    # For a range of dates, the single date is the midpoint (rounded down)
    date = (start_date + end_date) // 2
    return True, date, True, start_date, end_date, act_century, act_circa
    
# Disassemble Wikibase-style dateTime strings into year, precision, and BCE components
def extract_from_iso_date_string(string):
    """Split a string of the form '+1503-00-00T00:00:00Z/9' into its parts.

    Returns (year, precision, bce): `year` is the unsigned year as an int,
    `precision` is the text after the slash (unchanged), and `bce` is True
    when the dateTime part begins with a minus sign.

    The leading sign is optional: a value without '+' or '-' is read as a CE
    year instead of losing its first digit. Surrounding whitespace on the
    dateTime part is ignored.

    Raises IndexError when there is no '/' separator and ValueError when the
    year is not a number.
    """
    pieces = string.split('/')
    date_time = pieces[0].strip()
    # precision comes after the slash in the Wikibase format
    precision = pieces[1]
    # check for negative sign for BCE dates
    bce = date_time.startswith('-')
    # skip the sign only when one is actually present
    if date_time[:1] in ('+', '-'):
        date_time = date_time[1:]
    year = date_time.split('-')[0]
    return int(year), precision, bce

# Pull the structured date information out of the date field scraped from the Commons Mediawiki table
def process_commons_date(commons_date_string):
    """Extract inception data from the JSON-encoded list of Commons date strings.

    Only list items containing 'date QS' are examined. Such an item is a
    comma-separated record: a leading label field, the inception dateTime,
    then optional qualifier ID/value pairs.

    Returns a 12-tuple:
        (found, date, precision, bce, commons_range,
         start_date, start_precision, start_bce,
         end_date, end_precision, end_bce, commons_circa)
    Anything not present in the data keeps its default (0, '9' or False).
    """
    found = False
    commons_circa = False
    commons_range = False
    date, precision, bce = 0, '9', False
    start_date, start_precision, start_bce = 0, '9', False
    end_date, end_precision, end_bce = 0, '9', False

    for entry in json.loads(commons_date_string):
        # Skip the parts of the extracted metadata that carry no structured data
        if 'date QS' not in entry:
            continue
        found = True

        # Drop the leading label field; the inception dateTime comes next
        fields = entry.split(',')[1:]
        date, precision, bce = extract_from_iso_date_string(fields[0])
        qualifiers = fields[1:]

        # A trailing "circa" item (Q5727902) and the property ID before it are consumed together
        if len(qualifiers) >= 2 and qualifiers[-1] == 'Q5727902':
            commons_circa = True
            qualifiers = qualifiers[:-2]

        # Earliest date (P1319) or start time (P580); the value follows the P ID
        if len(qualifiers) > 0 and qualifiers[0] in ('P1319', 'P580'):
            commons_range = True
            start_date, start_precision, start_bce = extract_from_iso_date_string(qualifiers[1])
            qualifiers = qualifiers[2:]

        # Latest date (P1326) or end time (P582); the value follows the P ID
        if len(qualifiers) > 0 and qualifiers[0] in ('P1326', 'P582'):
            commons_range = True
            end_date, end_precision, end_bce = extract_from_iso_date_string(qualifiers[1])

    return (found, date, precision, bce, commons_range,
            start_date, start_precision, start_bce,
            end_date, end_precision, end_bce, commons_circa)
    
# -------------------
# Top level functions
# -------------------
    
def _set_inception_range(output_dict, earliest=('', ''), latest=('', '')):
    """Write the earliest/latest date columns from (value, precision) pairs.

    With the defaults all four columns are blanked, which means "no range".
    """
    output_dict['inception_earliest_date_val'] = earliest[0]
    output_dict['inception_earliest_date_prec'] = earliest[1]
    output_dict['inception_latest_date_val'] = latest[0]
    output_dict['inception_latest_date_prec'] = latest[1]


def _signed_year(year, bce):
    """Return the year as a signed number (negative for BCE) so years can be compared."""
    return -year if bce else year


def process_dates(act_id, output_dict, issue_log, act_url, commons_url, act_date_string, commons_date_string, filename=None):
    """Fill the inception columns of `output_dict` from the ACT and Commons dates.

    Parameters:
        act_id: ACT record number, used to label issue-log lines.
        output_dict: row dictionary that receives the `inception_*` keys.
        issue_log: text of the issues found so far for this work.
        act_url, commons_url: reference URLs for the two sources.
        act_date_string: raw ACT date text ('' if none).
        commons_date_string: JSON-encoded list of Commons date strings.
        filename: Commons file name used to label issue-log lines. When
            omitted, the module-level variable `filename` is used if it
            exists (this is what existing callers rely on), otherwise ''.

    Source selection: when ACT has a date it is used (with the ACT URL as
    reference) and any disagreement with Commons is logged; when only Commons
    has a date, the Commons values are used; when neither has one, every
    column is blank and the problem is logged.

    Returns:
        (output_dict, issue_log) with the new columns and log lines added.
    """
    if filename is None:
        filename = globals().get('filename', '')
    log_prefix = act_id + ' | ' + filename + ' | '

    # Hard code ACT BCE to False, at least until it's determined whether any ACT dates
    # are designated as BCE or have negative signs.
    act_bce = False

    (act_found, act_date, act_range, act_start_date, act_end_date,
     act_century, act_circa) = process_act_date(act_date_string)

    (commons_found, commons_date, commons_precision, commons_bce, commons_range,
     commons_start_date, commons_start_precision, commons_start_bce,
     commons_end_date, commons_end_precision, commons_end_bce,
     commons_circa) = process_commons_date(commons_date_string)

    # Signed years of the values actually written, kept for the reasonableness
    # checks at the end. Comparing the formatted strings instead would
    # mis-order BCE dates.
    inception_year = None
    earliest_year = None
    latest_year = None

    if not act_found and not commons_found:
        issue_log += log_prefix + 'no dates retrieved. Commons data: ' + commons_date_string + '\n'
        output_dict['inception_ref1_referenceUrl'] = ''
        output_dict['inception_ref1_retrieved_val'] = ''
        output_dict['inception_val'] = ''
        output_dict['inception_sourcing_circumstances'] = ''
        output_dict['inception_prec'] = ''
        _set_inception_range(output_dict)

    elif act_found:
        # ACT data are used whenever they exist; Commons only serves as a cross-check
        act_precision = '7' if act_century else '9'
        output_dict['inception_ref1_referenceUrl'] = act_url
        output_dict['inception_ref1_retrieved_val'] = today

        if commons_found:
            # Flag as a potential error anything on which the two sources disagree
            if act_bce != commons_bce:
                issue_log += log_prefix + 'Disagreement between ACT and Commons on CE/BCE.\n'
            if act_date != commons_date:
                issue_log += log_prefix + 'ACT inception: ' + str(act_date) + ', Commons inception: ' + str(commons_date) + '\n'
            if act_circa != commons_circa:
                issue_log += log_prefix + 'Disagreement between ACT and Commons on circa.\n'
            if act_precision != commons_precision:
                issue_log += log_prefix + 'ACT precision: ' + act_precision + ', Commons precision: ' + commons_precision + '\n'
            # Skip the range check if Commons has no range
            commons_has_range = not (commons_start_date == 0 and commons_end_date == 0)
            if act_range and commons_has_range:
                if not (act_start_date == commons_start_date and act_end_date == commons_end_date):
                    issue_log += log_prefix + 'ACT date range: ' + str(act_start_date) + '-' + str(act_end_date) + ', Commons date range: ' + str(commons_start_date) + '-' + str(commons_end_date) + '\n'

        output_dict['inception_val'] = generate_date_string(act_date, act_bce)
        output_dict['inception_sourcing_circumstances'] = 'Q5727902' if act_circa else ''
        output_dict['inception_prec'] = act_precision
        inception_year = _signed_year(act_date, act_bce)

        if act_range:
            _set_inception_range(
                output_dict,
                (generate_date_string(act_start_date, act_bce), act_precision),
                (generate_date_string(act_end_date, act_bce), act_precision),
            )
            earliest_year = _signed_year(act_start_date, act_bce)
            latest_year = _signed_year(act_end_date, act_bce)
        else:
            _set_inception_range(output_dict)

    else:
        # Only Commons supplied a date
        output_dict['inception_ref1_referenceUrl'] = commons_url
        output_dict['inception_ref1_retrieved_val'] = today
        output_dict['inception_val'] = generate_date_string(commons_date, commons_bce)
        output_dict['inception_sourcing_circumstances'] = 'Q5727902' if commons_circa else ''
        output_dict['inception_prec'] = commons_precision
        inception_year = _signed_year(commons_date, commons_bce)

        earliest = ('', '')
        latest = ('', '')
        if commons_range:
            # A year of 0 means that end of the range was not present in the
            # Commons data; leave it blank rather than writing year 0000.
            if commons_start_date != 0:
                earliest = (generate_date_string(commons_start_date, commons_start_bce), commons_start_precision)
                earliest_year = _signed_year(commons_start_date, commons_start_bce)
            if commons_end_date != 0:
                latest = (generate_date_string(commons_end_date, commons_end_bce), commons_end_precision)
                latest_year = _signed_year(commons_end_date, commons_end_bce)
        _set_inception_range(output_dict, earliest, latest)

    # Check dates for reasonableness. Generated dates always fall on 1 January,
    # so a date is in the future exactly when its year is after the current year.
    if inception_year is not None and inception_year > int(today[:4]):
        issue_log += log_prefix + 'Inception date occurs in the future. Date: ' + output_dict['inception_val'] + '\n'
    if earliest_year is not None and latest_year is not None and earliest_year > latest_year:
        issue_log += log_prefix + 'Final date in range before initial date: ' + output_dict['inception_earliest_date_val'] + ', ' + output_dict['inception_latest_date_val'] + '\n'

    return output_dict, issue_log

def process_labels(filename, commons_data_type, commons_data):
    """Split the Commons title/description of a work into per-language strings.

    Parameters:
        filename: Commons file name identifying the row of `commons_data`.
        commons_data_type: template type; 'none' means no Commons data,
            'artwork' selects the `title` column, anything else selects
            the `description` column.
        commons_data: data frame with `filename`, `title` and `description`
            columns, where the title/description cells hold JSON-encoded
            lists of text blobs.

    A blob containing one of the known language names starts a new language;
    the blobs that follow are concatenated into that language's title. Text
    before any language tag is assumed to be English. Empty blobs are skipped.

    Returns:
        A list of {'lang': code, 'title': text} dictionaries in the order
        encountered (empty when there is no Commons data).
    """
    languages = [
        {'string':'English', 'code': 'en'},
        {'string':'Русский', 'code': 'ru'},
        {'string':'Deutsch', 'code': 'de'},
        {'string':'Français', 'code': 'fr'},
        {'string':'Ελληνικά', 'code': 'el'},
        {'string':'Norsk bokmål', 'code': 'nb'},
        {'string':'Português', 'code': 'pt'},
        {'string':'Italiano', 'code': 'it'},
        {'string':'Español', 'code': 'es'},
        {'string':'עברית', 'code': 'he'},
        {'string':'Català', 'code': 'ca'},
        {'string':'Slovenščina', 'code': 'sl'},
        {'string':'Hrvatski', 'code': 'hr'},
        {'string':'Svenska', 'code': 'sv'},
        {'string':'Српски / srpski', 'code': 'sr'},
        {'string': '中文', 'code': 'zh-Hans'}
         ]

    # Look up the title in the Commons data. For information template, it's usually description; for artwork: title
    commons_title_list = []
    if commons_data_type != 'none':
        title_field = 'title' if commons_data_type == 'artwork' else 'description'
        commons_title_list = json.loads(commons_data.loc[commons_data.filename == filename, title_field].values[0])

    work_language_strings = []
    build_string = ''
    language_code = 'en' # assume starting with English if no explicit tag
    # Examine all of the blobs of the Commons language list
    for blob in commons_title_list:
        # An empty blob adds no text, and indexing into it would raise IndexError
        if not blob:
            continue
        found_code = next((language['code'] for language in languages if language['string'] in blob), None)
        if found_code is not None:
            # close off previous language string
            work_language_strings.append({'lang': language_code, 'title': build_string.strip()})
            # start new build string
            language_code = found_code
            build_string = ''
        elif language_code == 'zh-Hans' or blob.startswith(','):
            # don't put spaces between Chinese characters or before a comma
            build_string += blob
        else:
            build_string += ' ' + blob # otherwise, put spaces between the concatenated blobs
    # When through the entire list, finish off the last string
    work_language_strings.append({'lang': language_code, 'title': build_string.strip()})
    # If the very first blob was a language tag and nothing was added to the initial string, then delete it
    if work_language_strings[0]['title'] == '':
        work_language_strings = work_language_strings[1:]

    return work_language_strings

def process_type(act_id, act_type_string, commons_type_string, issue_log):
    """Determine the "instance of" type of a work from ACT or Commons text.

    The ACT string is searched first for any of the known type words
    (case-insensitively); only if none is found are the Commons blobs
    searched. In both searches the first entry of the `types` list that
    matches wins, so the list order is the priority order.

    Parameters:
        act_id: ACT record number, used to label the issue-log line.
        act_type_string: ACT object type text.
        commons_type_string: JSON-encoded list of Commons object type blobs.
        issue_log: text of the issues found so far for this work.

    Returns:
        (type_string, qid, issue_log, data_source) where `data_source` is
        'act', 'commons', or '' when no type could be identified (in which
        case `type_string` and `qid` are '' and a line is added to the log).
    """
    types = [
        {'string': 'drawing', 'qid': 'Q93184'},
        {'string': 'painting', 'qid': 'Q3305213'},
        {'string': 'manuscript', 'qid': 'Q87167'},
        {'string': 'sculpture', 'qid': 'Q860861'},
        {'string': 'carving', 'qid': 'Q97570030'},
        {'string': 'textile', 'qid': 'Q28823'},
        {'string': 'garment', 'qid': 'Q11460'},
        {'string': 'photograph', 'qid': 'Q125191'},
        {'string': 'architecture', 'qid': 'Q811979'},
        {'string': 'mosaic', 'qid': 'Q133067'},
        {'string': 'fresco', 'qid': 'Q22669139'},
        {'string': 'print', 'qid': 'Q11060274'},
        {'string': 'mural', 'qid': 'Q219423'},
        {'string': 'watercolor', 'qid': 'Q3305213'}
         ]

    commons_type_list = json.loads(commons_type_string)
    qid = ''
    type_string = ''
    data_source = ''

    # Look for type in ACT data
    act_text = act_type_string.lower()
    match = next((kind for kind in types if kind['string'] in act_text), None)
    if match is not None:
        data_source = 'act'
    else:
        # Fall back to the Commons data, stopping at the first type that
        # matches any blob (same priority rule as the ACT search)
        commons_blobs = [blob.lower() for blob in commons_type_list]
        match = next(
            (kind for kind in types if any(kind['string'] in blob for blob in commons_blobs)),
            None,
        )
        if match is not None:
            data_source = 'commons'

    if match is None:
        issue_log += act_id + ' | Could not identify type (instance of)\n'
    else:
        type_string = match['string']
        qid = match['qid']

    # The search word "architecture" is reported under a different type name
    if type_string == 'architecture':
        type_string = 'architectural structure'

    return type_string, qid, issue_log, data_source



## Extract/clean inception date

The dates given in ACT and Commons can have several characteristics:

- beginning and ending ranges
- circa. Designated very consistently as "ca." in ACT. 
- century designation (essentially setting the precision to the century level)

The Commons data is also sometimes structured using Wikidata property (P) and item (Q) IDs, qualifiers, and the standard xsd:dateTime format:
- Starts with `QS:P571` (inception)
- Sometimes has `P1319` (earliest date) and `P1326` (latest date)
- Sometimes has `P580` (start time) and `P582` (end time)
- Sometimes has qualifier `P1480` (sourcing circumstances) with value `Q5727902` (circa)
- Date precisions can be 7 (century), 8 (decade), or 9 (year)


In [None]:
# Main processing cell: build one output row (dictionary) per ACT work and
# collect the issues found along the way.

# Accumulates the issue text for all works
entire_issue_log = ''

# If there are any existing data in VanderBot format, load it (or just the file headers)
output_list = read_dicts_from_csv('abstract_artworks.csv')

# One entry per work with the label strings extracted from Commons (saved to JSON in a later cell)
labels_data = []

for index, work in act_data.iterrows():
    # Issues found for this work only; appended to entire_issue_log at the end of the iteration
    issue_log = ''
    output_dict = {}
    act_id = work['RecordNumber']
    act_url = 'http://diglib.library.vanderbilt.edu/act-imagelink.pl?RC=' + act_id
    
    # Look up the Commons page URL
    # NOTE: .values[0] raises IndexError if the ACT ID has no row in the IDs table
    commons_url = ids.loc[ids.RecordNumber == act_id, 'commons_page_url'].values[0]
    
    # Look up the Commons filename in the IDs file using the ACT ID
    # This seems inefficient to have to look up the row twice, but I suppose Pandas is
    # efficient enough with a small dataset like this that it doesn't matter.
    # NOTE: process_dates() reads this variable by name as a global when building
    # its issue-log lines, so it must be assigned before that call and keep this name.
    filename = ids.loc[ids.RecordNumber == act_id, 'filename'].values[0]

    # Determine whether the information or artwork template was used, or if Commons data not available
    # (anything other than exactly one matching Commons row is treated as "not available")
    commons_date_type_series = commons_data.loc[commons_data.filename == filename, 'template_type']    
    if len(commons_date_type_series) == 1:
        commons_data_type = commons_date_type_series.values[0]
    else:
        commons_data_type = 'none'
    print('commons data type:', commons_data_type)
    output_dict['commons_template'] = commons_data_type
    
    # *** Process title information to generate English label ***
    
    act_title = work['Title']
    
    # Extract information about titles/labels from Commons data
    work_language_strings = process_labels(filename, commons_data_type, commons_data)
    
    # Attach the extracted language data to the list for later saving
    labels_data.append({'act_id': act_id, 'act_title': act_title, 'commons_data_type': commons_data_type, 'commons_language_strings': work_language_strings})
    
    # Add discovered label data to the output
    output_dict['label_en'] = act_title # Use the ACT title as the default
    
    # Record the English title from Commons separately; if several English strings
    # were extracted, the last one is kept
    commons_label = ''
    for work_language in work_language_strings:
        if work_language['lang'] == 'en':
            commons_label = work_language['title']
    output_dict['label_commons'] = commons_label
    
    # *** Other fields ***
    
    # Create ACT ID output
    output_dict['act'] = act_id
    output_dict['act_ref1_retrieved_val'] = today
    
    # Create Commons image output
    # VanderBot will convert the raw, unencoded file name into the appropriate IRI for Wikidata
    output_dict['image'] = filename
    output_dict['image_ref1_referenceUrl'] = act_url
    output_dict['image_ref1_retrieved_val'] = today
    
    # *** Process inception dates ***
    
    # Get the ACT date string value
    act_date_string = work['DateCreation']

    # Look up the date value in the Commons data
    if commons_data_type != 'none':
        commons_date_string = commons_data.loc[commons_data.filename == filename, 'date'].values[0]
    else:
        commons_date_string = '[]' # case where no commons data exists for this work

    #print(act_date_string, commons_date_string)
    
    # Adds the inception_* columns to the row and any date problems to the log
    output_dict, issue_log = process_dates(act_id, output_dict, issue_log, act_url, commons_url, act_date_string, commons_date_string)
    
    # *** Determine type (instance of)
    
    act_type_string = work['ObjectFunction']
    
    # Look up the object type value in the Commons data
    if commons_data_type != 'none':
        commons_type_string = commons_data.loc[commons_data.filename == filename, 'object type'].values[0]
    else:
        commons_type_string = '[]' # case where no commons data exists for this work
    type_string, instance_of_qid, issue_log, data_source = process_type(act_id, act_type_string, commons_type_string, issue_log)
    #print(type_string, instance_of_qid)
    
    # The reference URL points at whichever source supplied the type (blank if neither did)
    output_dict['instance_of'] = instance_of_qid
    if data_source == 'act':
        output_dict['instance_of_ref1_referenceUrl'] = act_url
    elif data_source == 'commons':
        output_dict['instance_of_ref1_referenceUrl'] = commons_url
    else:
        output_dict['instance_of_ref1_referenceUrl'] = ''
        
    # A retrieval date is only recorded when there is a reference to go with it
    if data_source != '':
        output_dict['instance_of_ref1_retrieved_val'] = today
    else:
        output_dict['instance_of_ref1_retrieved_val'] = ''
    
    # *** next ***
    
    if issue_log != '': # include extra blank line only if some issues were added for this work
        issue_log += '\n'
    entire_issue_log += issue_log
    print()
    output_list.append(output_dict)
    
#print(entire_issue_log)
#print(json.dumps(labels_data, indent =2, ensure_ascii=False))


In [None]:
# Display all issues recorded during processing; the entries for each work are
# followed by a blank line
print(entire_issue_log)

In [None]:
# Display the output rows as JSON, skipping the row at index 0
# (presumably the placeholder row loaded from abstract_artworks.csv - confirm)
print(json.dumps(output_list[1:], indent =2))

In [None]:
# Save the extracted label data as indented JSON; ensure_ascii=False keeps
# non-ASCII titles as readable characters instead of \u escapes
with open('language_labels.json', 'wt', encoding='utf8') as labels_file:
    json.dump(labels_data, labels_file, ensure_ascii=False, indent=2)

In [None]:
# The column headers are taken from the first row of output_list. Eventually they
# should maybe be hard-coded; for now this is jury-rigged by keeping a dummy value
# in the first data row of the table so that the headers get picked up.
fieldnames = list(output_list[0])

write_dicts_to_csv(table=output_list, filename='abstract_artworks_out.csv', fieldnames=fieldnames)
print('done')