# Fine arts gallery data processing script

The script starts with a dump from JSTOR. The Excel file must then be saved as a CSV. NOTE: Save the CSV file with a `.txt` file extension (gallery_works0.txt) and don't open it in a spreadsheet program except through the import routine, where you can set the accession number column to be text. Failure to do this will result in the loss of trailing zeros and item mismatches in the future. 

It is best to avoid opening the CSV files for manual editing and to just let the script do the work. The script reliably opens the file without corrupting the accession number.

## Function section

This cell needs to be run before any of the other cells.

In [None]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
from fuzzywuzzy import fuzz # fuzzy logic matching
from copy import deepcopy
from langdetect import detect
from langdetect import detect_langs
import datetime

# ----------------
# Configuration settings
# ----------------

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# Calculate the reference date retrieved value for all statements
# datetime.utcnow() is deprecated (Python 3.12+), so take a timezone-aware UTC time instead.
# The tzinfo is dropped again so that the string keeps its original form without a "+00:00" suffix.
whole_time_string_z = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() # form: 2019-12-05T15:35:04.959311
dateZ = whole_time_string_z.split('T')[0] # form 2019-12-05
ref_retrieved = dateZ + 'T00:00:00Z' # form 2019-12-05T00:00:00Z as provided by Wikidata, without leading +

# ----------------
# Utility functions
# ----------------

# Some Wikimedia servers don't like unidentified clients, so a User-Agent header is always sent
def generate_header_dictionary(accept_media_type):
    """Return the HTTP headers used for SPARQL requests.

    accept_media_type: value to send in the Accept header.
    The Content-Type and User-Agent values are fixed.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    headers['Content-Type'] = 'application/sparql-query'
    headers['User-Agent'] = 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    return headers

# Default request headers, built once when this cell runs; reused by searchNameAtWikidata() and searchWikidataDescription()
requestheader = generate_header_dictionary(accept_media_type)

# load a CSV file as a list of row dictionaries
def read_dict(filename):
    """Read the UTF-8 CSV file `filename` and return its data rows as a list.

    Each row is a dictionary keyed by the column names in the header row.
    All values are returned as strings, exactly as csv.DictReader yields them.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# save a list of row dictionaries as a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write `table` (an iterable of dictionaries) to the UTF-8 CSV file `filename`.

    fieldnames: column names, in output order; written first as the header row.
    An existing file with the same name is overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# collect the distinct values of a column (or of a simple list), keeping first-seen order
def non_redundant(table, column_key):
    """Return the distinct values found in `table`, in order of first appearance.

    table: a list of dictionaries, or a simple list of values.
    column_key: the key whose values are collected; pass '' to treat each
        item of `table` itself as the value.

    Values are compared with == rather than hashed, so unhashable values
    (for example whole row dictionaries) are supported.
    """
    unique_values = []
    for row in table:
        candidate = row if column_key == '' else row[column_key]
        if not any(candidate == kept for kept in unique_values):
            unique_values.append(candidate)
    return unique_values

# identity key function for sorting a simple list
def sort_funct(row):
    """Return `row` unchanged, so that items are ordered by their own value."""
    return row

# key function for sorting rows by their last_first name value
def sort_last_first(row):
    """Return the 'last_first' value of `row` for use as a sort key."""
    key_name = 'last_first'
    return row[key_name]

# key function for sorting rows by their match score
def sort_score(row):
    """Return the 'score' value of `row` for use as a sort key."""
    key_name = 'score'
    return row[key_name]

# pulls the Q ID out of a Wikidata entity IRI
def extract_qnumber(iri):
    """Return the fifth '/'-separated piece of `iri`.

    For an IRI of the form http://www.wikidata.org/entity/Q6386232 this is
    the Q ID ('Q6386232'). Raises IndexError if the IRI has fewer pieces.
    """
    return iri.split('/')[4]

# search label and alias
# Note on the graph pattern used for the class restriction:
# with the pattern
#
#   wd:Q21 wdt:P31 ?class.
#
# England (Q21) is not found to be a Q6256 (country), but with the pattern
#
#   wd:Q21 p:P31 ?statement.
#   ?statement ps:P31 ?class.
#
# it is. (Presumably wdt: only exposes the best-ranked statements while p:/ps: exposes all of them; confirm.)
def searchLabelsAtWikidata(string, class_list):
    """Return the Q IDs of Wikidata items whose English label or alias is exactly `string`.

    string: the label text to look for.
    class_list: list of class Q IDs (e.g. ['Q6256']); when it is not empty, an
        item must be an instance (P31) of at least one of them.

    Returns a list of Q ID strings (possibly empty). Sends one query to the
    SPARQL endpoint and then pauses for sparql_sleep seconds.
    """
    # The text is placed inside a double-quoted SPARQL literal, so characters that would
    # end or corrupt the literal have to be escaped. The backslash must be handled first.
    literal = string.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '\\r')

    # create a string for the query
    query = 'select distinct ?id '
    query += '''where {
  {?id rdfs:label "''' + literal + '''"@en.}
  union
  {?id skos:altLabel "''' + literal + '''"@en.}
  '''
    # one graph pattern per class, combined with "union" so that any one of the classes qualifies
    class_patterns = [
        '''{?id p:P31 ?statement.
  ?statement ps:P31 wd:''' + class_qid + '''.}
  '''
        for class_qid in class_list
    ]
    query += '''union
  '''.join(class_patterns)
    query += '''}'''
    #print(query)

    r = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    data = r.json()
    return_value = [extract_qnumber(result['id']['value']) for result in data['results']['bindings']]

    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def retrieve_gallery_classes():
    """Return the classes (P31 values) of the items in collection Q160236.

    The collection is fixed to the Metropolitan Museum of Art because there
    are too many collections to query without specifying one.

    Returns a list of {'label': English label, 'qid': class Q ID} dictionaries
    in the order returned by the query (ordered by label). Sends one query to
    the SPARQL endpoint and then pauses for sparql_sleep seconds.
    """
    sparql = '''select distinct ?class ?label where 
      {
      ?item wdt:P195 wd:Q160236.
      ?item wdt:P31 ?class.
      ?class rdfs:label ?label.
      filter(lang(?label) = 'en')
      }
      order by ?label'''

    #print(sparql)

    print('sending query')
    response = requests.post(endpoint, data=sparql.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    print('results returned')
    bindings = response.json()['results']['bindings']
    classes = [
        {'label': binding['label']['value'], 'qid': extract_qnumber(binding['class']['value'])}
        for binding in bindings
    ]

    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return classes

def generateNameAlternatives(name):
    """Return a de-duplicated list of spelling variants of a personal name.

    name: a name in given-names-first order, optionally ending with a suffix
        (Jr, Sr, II, III, IV, V, or "the elder").

    The variants are: the full name, the full name with its suffix, and the
    combinations of first name / initials (with and without periods) with the
    last name. Commas are treated as separators and periods are discarded
    before the name is split.

    Returns a list in no particular order. A name with no usable pieces gives
    an empty list; a single-word name gives only that word (plus its suffixed
    form, if any), since there is nothing to abbreviate.
    """
    # split() without an argument collapses runs of whitespace, so the space left behind
    # by a comma (as in "Smith, Jr.") cannot produce an empty piece
    pieces = name.replace(',', ' ').replace('.', '').split()

    # Remove ", Jr.", "III", etc. from end of name, remembering how to write it back.
    # A lone word is never treated as a suffix, otherwise nothing would be left of the name.
    suffix_formats = {'Jr': ', Jr.', 'Sr': ', Sr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    suffix = ''
    if len(pieces) > 1 and pieces[-1] in suffix_formats:
        suffix = suffix_formats[pieces.pop()]
    elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
        del pieces[-2:]
        suffix = ' the elder'

    # Generate an initial for every piece. Pieces and initials are collected together so
    # that the two lists always line up; a piece that yields no initial is dropped.
    kept_pieces = []
    initials = []
    for piece in pieces:
        # make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        lettered = piece if piece[0].isalpha() else piece[1:]
        if lettered:
            kept_pieces.append(piece)
            initials.append(lettered[0])
    if not kept_pieces:
        return []

    first_name = kept_pieces[0]
    last_name = kept_pieces[-1]
    full_name = ' '.join(kept_pieces)

    # a set removes the duplicates that arise when there are no middle names
    alternatives = {full_name}
    if suffix != '':
        alternatives.add(full_name + suffix)
    if len(kept_pieces) == 1:
        return list(alternatives)

    middle_initials = initials[1:-1]
    leading_initials = initials[:-1]
    alternatives.update([
        # first and last name with middle initials
        ' '.join([first_name] + middle_initials + [last_name]),
        # first and last name with middle initials and periods
        ' '.join([first_name] + [initial + '.' for initial in middle_initials] + [last_name]),
        # first and last name only
        first_name + ' ' + last_name,
        # first initial and last name only
        initials[0] + ' ' + last_name,
        # first initial with period and last name only
        initials[0] + '. ' + last_name,
        # all name initials with last name
        ' '.join(leading_initials + [last_name]),
        # all name initials with periods with last name
        ' '.join([initial + '.' for initial in leading_initials] + [last_name]),
        # all name initials concatenated with last name
        ''.join(leading_initials) + ' ' + last_name,
    ])

    return list(alternatives)

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias matches a variant of `name`.

    The variants come from generateNameAlternatives(). Quote characters are
    removed from them because they would break the query.

    Returns a list of {'qId': Q ID, 'name': English label} dictionaries, one
    per result row. If the response cannot be processed, returns
    [{'error': response text}] instead. Sends one query to the SPARQL endpoint
    and then pauses for sparql_sleep seconds.
    """
    nameList = generateNameAlternatives(name)
    if not nameList:
        # nothing to search for, so don't bother the endpoint
        return []

    # one quoted, English-tagged literal per line for the VALUES clause
    alternatives = ''.join(
        '"' + alternative.replace('"', '').replace("'", '') + '"@en\n'
        for alternative in nameList
    )
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)
    #print('searching for ', name)
    results = []
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            wikidataIri = statement['item']['value']
            if 'label' in statement:
                name = statement['label']['value']
            else:
                name = ''
            # use this file's own helper, as the other search functions do
            qNumber = extract_qnumber(wikidataIri)
            results.append({'qId': qNumber, 'name': name})
    except Exception:
        # best effort: hand the raw response back to the caller for inspection
        # (Exception rather than a bare except, so that Ctrl-C still interrupts)
        results = [{'error': r.text}]
    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def name_variant_testing(name, variant):
    """Return the fuzz.ratio similarity score of the given-name parts of two names.

    Periods are removed from both names, then everything before the final
    space-separated word (taken to be the surname) is compared. A one-word
    name is compared as a whole.
    """
    def given_part(full_name):
        # drop periods, then cut off the last word together with the space before it
        full_name = full_name.replace('.', '')
        words = full_name.split(' ')
        if len(words) > 1:
            return full_name[0:-(len(words[len(words) - 1]) + 1)]
        return full_name

    first_names = given_part(name)
    first_variants = given_part(variant)

    # Of the available measures (ratio, partial_ratio, token_sort_ratio, token_set_ratio),
    # the plain ratio was judged to work best for this comparison.
    return fuzz.ratio(first_names, first_variants)

def find_surname_givens(name):
    """Split a given-names-first personal name into given names and family name.

    Periods and commas are removed, one leading non-alphabetic character is
    stripped from each word (cases like (Kit) or "Kit"), and trailing suffixes
    (Jr, Sr, II, III, IV, V) are dropped.

    Returns {'given': ..., 'family': ...}, where 'family' is the last
    remaining word and 'given' is the rest joined by single spaces. Returns
    False if fewer than two words remain.
    """
    # Get rid of periods and commas
    name = name.replace('.', '').replace(',', '')

    # split() without an argument ignores repeated spaces, so no empty pieces are produced
    pieces = []
    for piece in name.split():
        # Make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        if not piece[0].isalpha():
            piece = piece[1:] # remove the first non-alphabetic character
        if piece: # skip pieces with nothing left
            pieces.append(piece)

    # Get rid of ", Jr.", "III", etc. Only trailing words are treated as suffixes,
    # so that a middle initial such as the V in "Mary V. Jones" is kept.
    suffixes = ('Jr', 'Sr', 'II', 'III', 'IV', 'V')
    while pieces and pieces[-1] in suffixes:
        pieces.pop()

    # Must be at least a surname and something else
    if len(pieces) < 2:
        return False

    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def remove_parens(string):
    """Return the text before the first '(' in `string`, with surrounding whitespace removed."""
    before_paren, _, _ = string.partition('(')
    return before_paren.strip()

def remove_description(string):
    """Return the text inside the first pair of parentheses in `string`.

    The result is the text between the first '(' and the next ')' or '(',
    whichever comes first, with surrounding whitespace removed. Returns ''
    if there is no '(' or if `string` is not a string (for example None).
    """
    try:
        right_string = string.split('(')[1]
    except (IndexError, AttributeError, TypeError):
        # IndexError: no opening parenthesis; the others: value is not a str
        return ''
    return right_string.split(')')[0].strip()

def reverse_names(string):
    """Turn a "Last, First" name into "First Last".

    Only the first two comma-separated parts are used; each is stripped of
    surrounding whitespace. Raises IndexError if `string` contains no comma.
    """
    parts = string.split(',')
    family = parts[0].strip()
    given = parts[1].strip()
    return given + ' ' + family

# Screens for Wikidata items that are potential matches

# vb_common_code is a separate project module; vbc.Query is defined there, not in this file.
import vb_common_code as vbc
# Reusable query objects, one per Wikidata property. Each is given sparql_sleep as its
# `sleep` argument (presumably the pause between requests; confirm in vb_common_code).
# P31 values: human() compares the values returned for an item with 'Q5'
retrieve_class_list_query = vbc.Query(pid='P31', uselabel=False, sleep=sparql_sleep)
# P569 and P570 values; going by the variable names, these are dates of birth and death.
# isitem=False presumably marks the values as literals rather than items (TODO confirm).
retrieve_birth_date_query = vbc.Query(isitem=False, pid='P569', sleep=sparql_sleep)
retrieve_death_date_query = vbc.Query(isitem=False, pid='P570', sleep=sparql_sleep)

def human(qId):
    """Screen a Wikidata item: return False only if it is known not to be a human.

    qId: the Q ID of the item to check.

    Returns True if the item has no P31 values at all (it cannot be ruled
    out), or if 'Q5' is among its P31 values; otherwise False.
    """
    wdClassList = retrieve_class_list_query.single_property_values_for_item(qId)
    # no class property: nothing to screen on
    if len(wdClassList) == 0:
        return True
    # An item can have several P31 values in any order, so look for Q5 among all of them
    # rather than only testing the first one.
    return 'Q5' in wdClassList

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Retrieve the English description, ORCID (P496) and occupations (P106) of an item.

    qId: the Q ID of the item.

    Returns a dictionary:
      {} if the query gave no result rows;
      {'description': str, 'orcid': str, 'occupation': list of English labels}
          otherwise, with '' for a missing description or ORCID;
      {'error': response text} if the response could not be processed.
    Sends one query to the SPARQL endpoint and then pauses for sparql_sleep seconds.
    """
    resultsDict = {}
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')            
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    #print(query)
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        statements = r.json()['results']['bindings']
        if len(statements) > 0: # if no results, the dictionary remains empty
            first_statement = statements[0]
            # Only a single description per language is allowed, so there should only be one description
            if 'description' in first_statement:
                resultsDict['description'] = first_statement['description']['value']
            else:
                resultsDict['description'] = ''

            # Only a single ORCID is allowed, so there should only be one orcid value
            if 'orcid' in first_statement:
                resultsDict['orcid'] = first_statement['orcid']['value']
            else:
                resultsDict['orcid'] = ''

            # if there are multiple statements, that's because there is more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value']
                for statement in statements
                if 'occupation' in statement
            ]
    except Exception:
        # best effort: hand the raw response back to the caller for inspection
        # (Exception rather than a bare except, so that Ctrl-C still interrupts)
        resultsDict = {'error': r.text}
    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return resultsDict

def determine_era(string):
    """Separate a trailing era designation from a date string.

    Returns a (value, era) tuple:
      - if `string` does not contain 'CE' (this includes the empty string),
        value is `string` unchanged and era is 'unknown';
      - if it contains 'BCE', the last three characters are cut off and era is 'BCE';
      - otherwise the last two characters are cut off and era is 'CE'.
    The cut value is stripped of surrounding whitespace. The designation is
    assumed to be at the end of the string.
    """
    if 'CE' not in string:
        return string, 'unknown'

    if 'BCE' in string:
        era = 'BCE'
        designation_length = 3
    else:
        era = 'CE'
        designation_length = 2
    return string[0:len(string) - designation_length].strip(), era

def determine_zeros(date):
    """Return the number of consecutive '0' characters at the end of the string `date`.

    An empty string gives 0 and a string made up only of zeros gives its
    length, so the result is always an integer.
    """
    return len(date) - len(date.rstrip('0'))

def pad_zeros_left(date_string):
    """Return `date_string` padded on the left with '0' characters to a length of four.

    Strings that already have four or more characters are returned unchanged.
    """
    return date_string.rjust(4, '0')

def sign(era):
    """Return the sign prefix for a year in the given era: '-' for 'BCE', '' for anything else."""
    return '-' if era == 'BCE' else ''

def detect_language(string):
    """Guess the language of `string`.

    Returns a (lang, confidence) tuple built from the first result of
    detect_langs(): the first two characters of its language code and its
    probability as a float. Returns ('zxx', 0.0) when detection fails, which
    happens when there is nothing to decide on (e.g. a string of numbers).
    """
    try:
        # str() of a result has the form 'code:probability'. The code can be longer than two
        # characters (e.g. 'zh-cn'), so split on the last colon rather than at fixed positions.
        code, _, probability = str(detect_langs(string)[0]).rpartition(':')
        confidence = float(probability)
        lang = code[:2]
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    return lang, confidence

# Initial file processing

NOTE: When opening the files, be sure to pay attention to the file import dialog. That allows the accession number column to be imported as a string rather than as a number. Importing as a number causes trailing zeros to be dropped.

This section simplifies the column headers and writes a copy of the data to a CSV

In [None]:
filename = 'gallery_works0.csv'
old_works = read_dict(filename)
fieldnames = list(old_works[0].keys())

# print(fieldnames)

# If the file starts with a Byte Order Mark (BOM), it is read as the first character of the
# first column header, so the SSID column has to be looked up with the BOM in front.
ssid_field_name = 'SSID'
if fieldnames[0][0] == '\ufeff':
    ssid_field_name = '\ufeff' + ssid_field_name

# (simplified column name, column name in the source file, cleanup) in output order.
# cleanup is None (copy as is), 'strip' (remove leading and trailing whitespace) or
# 'flatten' (strip, then replace embedded hard returns with spaces).
# License and the Artstor fields are not populated, so they are left out.
column_map = [
    ('ssid', ssid_field_name, None),
    ('filename', 'Filename', None),
    ('title', 'Title[637073]', 'flatten'),
    ('creator_string', 'Creator[637071]', 'flatten'),
    ('date', 'Date[637076]', None),
    ('classification', 'Classification[637103]', None),
    ('medium', 'Medium[637080]', None),
    ('measurements', 'Measurements[637081]', None),
    ('style_period', 'Style/Period[637079]', None),
    ('country_culture', 'Country/Culture[637072]', None),
    ('seals_inscriptions', 'Seals & Inscriptions[637104]', 'strip'),
    ('signature', 'Signature[637105]', None),
    ('description', 'Description[637092]', None),
    ('publications', 'Publications[637106]', None),
    ('exhibitions', 'Exhibitions[637107]', None),
    ('accession_number', 'Accession Number[637085]', 'strip'),
    ('date_acquired', 'Date Acquired[637109]', None),
    ('gift_of', 'Gift of[637110]', None),
    ('purchased_from', 'Purchased from[637111]', None),
    ('credit_line', 'Credit Line[637112]', None),
    ('provenance', 'Provenance[637113]', None),
    ('collection', 'Collection[637114]', None),
    ('last_change', 'Last Change[637115]', None),
    ('notes', 'Notes[637116]', None),
    ('rights', 'Rights[637099]', None),
    ('media_url', 'Media URL', None),
]

works = []
for old_work in old_works:
    work = {}
    for new_name, old_name, cleanup in column_map:
        value = old_work[old_name]
        if cleanup is not None:
            value = value.strip()
        if cleanup == 'flatten':
            value = value.replace('\n', ' ')
        work[new_name] = value
    works.append(work)

# the simplified names become the header row of the output file
fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_renamed1.csv', fieldnames)

print('done')

# Parse dimensions data

This processes the values of the measurements column and separates them into height and width, height/width/depth, or (for single values) diameter quantities.

In [None]:
# Turns one measurements string into its separate dimensions
def _parse_measurements(string):
    """Return (height, width, depth, diameter) as strings, or None if `string` is malformed.

    string: a measurements value with surrounding whitespace already removed.
      - '' gives four empty strings;
      - a single value followed by "in." (e.g. "12 in.") is taken to be a diameter;
      - values separated by " x " (e.g. "10 x 20 in." or "10 x 20 x 5 in.") are
        taken to be height, width and, if there are more than two, depth.
    Dimensions that do not apply are returned as ''.
    """
    if string == '':
        return '', '', '', ''
    try:
        if ' x ' not in string:
            # one dimensional or improperly formatted
            pieces = string.split(' ')
            value = float(pieces[0])
            if pieces[1] != 'in.':
                return None
            return '', '', '', str(value)

        # the string has an x in it, so it's multidimensional
        pieces = [piece.strip() for piece in string.split('x')]
        # remove the "in." and anything after it from the last piece
        pieces[-1] = pieces[-1].split('in')[0].strip()
        height = str(float(pieces[0]))
        width = str(float(pieces[1]))
        # anything other than exactly two pieces is treated as a three-dimensional work
        depth = '' if len(pieces) == 2 else str(float(pieces[2]))
        return height, width, depth, ''
    except (ValueError, IndexError):
        # ValueError: a piece is not a number; IndexError: an expected piece is missing
        return None

for work in works:
    string = work['measurements'].strip()
    dimensions = _parse_measurements(string)
    if dimensions is None:
        # report the value so that it can be corrected manually, and leave all dimensions empty
        print(work['accession_number'], string)
        dimensions = ('', '', '', '')
    work['height'], work['width'], work['depth'], work['diameter'] = dimensions

fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_with_dim2.csv', fieldnames)

print('done')


# Parse inception dates

In [None]:
# Reload the works from the CSV file and attach a parsed form of the 'date' column to each
# work under the key 'inception':
#   {'dates': [single, range_start, range_end], 'circa': bool, 'century': bool}
# where every item of 'dates' is {'value': str, 'era': 'CE', 'BCE' or 'unknown'}.
filename = 'gallery_works_with_dim2.csv'
works = read_dict(filename)
# fieldnames = list(works[0].keys())
# print(fieldnames)

for work_index in range(len(works)):
    string = works[work_index]['date']
    
    # handle idiosyncratic date values
    # ("not dated" and "Unknown" are treated as no date; the qualifiers "late " and "mid " are dropped)
    if string == 'not dated':
        string = ''
    if string == 'Unknown':
        string = ''
    if string[0:5] == 'late ':
        string = string[5:]
    if string[0:4] == 'mid ':
        string = string[4:]
    # drop parenthetical comments after the dates
    if '(' in string:
        pieces = string.split('(')
        string = pieces[0].strip()
    # fix bad century designation ("th CE" written where "th century CE" was meant)
    # only the text before the first "th CE" is kept
    if 'th CE' in string:
        pieces = string.split('th CE')
        string = pieces[0] + 'th century CE'
        

    # split dates
    # A range is written either with a hyphen or with "to"; only the first two parts are used.
    # NOTE(review): 'to' is a substring test, so it also matches inside longer words; confirm this cannot occur in the data.
    date_list = ['', '', ''] # 0 is single date, 1 is beginning of range, 2 is end of range
    if '-' in string:
        date_list[0] = ''
        date_list[1] = string.split('-')[0].strip()
        date_list[2] = string.split('-')[1].strip()
    elif 'to' in string:
        date_list[0] = ''
        date_list[1] = string.split('to')[0].strip()
        date_list[2] = string.split('to')[1].strip()
        # handle special case of "late ... to early ..."
        if date_list[2][0:6] == 'early ':
            date_list[2] = date_list[2][6:]
    else:
        date_list[0] = string.strip()
        date_list[1] = ''
        date_list[2] = ''

    # extract CE and BCE
    # each string in date_list is replaced by a {'value': ..., 'era': ...} dictionary
    for date_index in range(len(date_list)):
        date_dict = {}
        date_dict['value'], date_dict['era'] = determine_era(date_list[date_index])
        date_list[date_index] = date_dict
    
    # If last date in range has a designation and the first one doesn't, assign it to the first date.
    if date_list[1]['value'] != '' and date_list[1]['era'] == 'unknown' and date_list[2]['era'] != 'unknown':
        date_list[1]['era'] = date_list[2]['era']
        
    # For dates with no specified era, assign CE
    # (empty values keep the era 'unknown')
    for date_index in range(len(date_list)):
        if date_list[date_index]['value'] != '' and date_list[date_index]['era'] == 'unknown':
            date_list[date_index]['era'] = 'CE'
    
    # Create a date dict to hold more information about the date format
    # (date_dict['dates'] and date_list are the same list object from here on)
    date_dict = {'dates': date_list}
    
    # Determine if date is circa
    # a leading "ca." on any of the values marks the whole date as circa and is removed from the value
    date_dict['circa'] = False
    for date_index in range(len(date_dict['dates'])):
        if date_list[date_index]['value'][0:3] == 'ca.':
            date_dict['circa'] = True
            date_dict['dates'][date_index]['value'] = date_dict['dates'][date_index]['value'][3:].strip()
    
    # Determine if values are centuries
    # a trailing "century" on any of the values marks the whole date as a century date and is removed
    date_dict['century'] = False
    for date_index in range(len(date_dict['dates'])):
        if date_dict['dates'][date_index]['value'][-7:] == 'century':
            date_dict['century'] = True
            date_dict['dates'][date_index]['value'] = date_dict['dates'][date_index]['value'][:-7].strip()
    if date_dict['century']: # if determined to be century values, strip off the "th"
        # the last two characters are removed from every value, whatever they are
        for date_index in range(len(date_dict['dates'])):
            date_dict['dates'][date_index]['value'] = date_dict['dates'][date_index]['value'][:-2]
    # append date dict to works
    works[work_index]['inception'] = date_dict

# check for bad dates
# Prints one line for every date value that is not a whole number, and one line for every
# CE range whose start is later than its end. Nothing is changed; the listed dates have to
# be fixed by hand in the source data.
print('Dates with problems that need to be fixed manually')
for work_index, work in enumerate(works):
    dates = work['inception']['dates']
    for date in dates:
        if date['value'] != '':
            try:
                int(date['value'])
            except ValueError:
                print(work_index, work['date'])
    # check for two-digit second numbers in ranges
    range_start = dates[1]
    range_end = dates[2]
    if range_start['value'] != '' and range_start['era'] == 'CE':
        try:
            out_of_order = int(range_start['value']) > int(range_end['value'])
        except ValueError:
            # A non-numeric value has already been reported by the loop above; an empty end
            # value is only skipped here. Either way, keep going so the listing is complete.
            continue
        if out_of_order:
            print(work_index, work['accession_number'], work['date'])
        
# Process dates into form needed by Wikidata
# For every work this fills in inception_val/inception_prec, the earliest/latest
# date columns (only populated for ranges) and the sourcing_circumstances column.
# Precision codes written here: '9' = year, '7' = century.
for work_index in range(len(works)):
#for work_index in range(325, 330):
    record = works[work_index]
    inception = record['inception']
    dates = inception['dates']  # index 0: single date, 1: range start, 2: range end

    if dates[1]['value'] != '': # cases with date ranges
        # Average ranges
        first, first_era = dates[1]['value'], dates[1]['era']
        second, second_era = dates[2]['value'], dates[2]['era']
        if inception['century']:
            # work in years rather than century numbers
            first = str(int(first) * 100)
            second = str(int(second) * 100)
        minimum_zeros = min(determine_zeros(first), determine_zeros(second))
        factor = 10**minimum_zeros
        average = (float(sign(first_era) + first) + float(sign(second_era) + second))/2

        if minimum_zeros < 2:
            # for years and decades, round to the nearest year
            average = int(average +.5)
            precision = '9'
        elif inception['century']: # date given in centuries
            if int(second) - int(first) == 200:
                # "xth to (x+2)th century": use the century x+1 between them
                average = int(average/factor)*factor
                precision = '7'
            else:
                # "xth to (x+1)th century" and all wider ranges: use the year midway
                # between the start of the first century and the end of the last
                average = (int(sign(first_era) + '1') * (int(first) - 100) + float(sign(second_era) + second))/2
                precision = '9'
        elif int(second) - int(first) == 100: # date given in year range
            # "x00 to (x+1)00": use the x+1 century. This is good for cases like "1400-1500"
            average = int(average/factor + 1)*factor
            precision = '7'
        else:
            # ranges like "x00-(x+2)00": keep the year in the middle, (x+1)00
            precision = '9'
        record['inception_prec'] = precision

        # positive dates aren't stored with signs, they are added by the upload script
        average = int(average) # remove any decimals and trailing zeros from the number
        sign_string = '-' if average < 0 else ''
        number_string = str(abs(average))
        record['inception_val'] = sign_string + pad_zeros_left(number_string) + '-01-01T00:00:00Z'

        # Now set the earliest and latest date values
        if inception['century']:
            century_digits, range_precision = '00', '7' # precision to century
        else:
            century_digits, range_precision = '', '9' # precision to year
        record['earliest_date_val'] = sign(dates[1]['era']) + pad_zeros_left(dates[1]['value'] + century_digits) + '-01-01T00:00:00Z'
        record['earliest_date_prec'] = range_precision
        record['latest_date_val'] = sign(dates[2]['era']) + pad_zeros_left(dates[2]['value'] + century_digits) + '-01-01T00:00:00Z'
        record['latest_date_prec'] = range_precision
    else: # cases without date ranges
        if dates[0]['value'] == '':
            record['inception_val'] = ''
            record['inception_prec'] = ''
        elif inception['century']: # date given in centuries
            record['inception_val'] = sign(dates[0]['era']) + pad_zeros_left(dates[0]['value'] + '00') + '-01-01T00:00:00Z'
            record['inception_prec'] = '7' # precision to century
        else: # date given in years
            record['inception_val'] = sign(dates[0]['era']) + pad_zeros_left(dates[0]['value']) + '-01-01T00:00:00Z'
            record['inception_prec'] = '9' # precision to year

        # a single date has no earliest/latest bounds
        record['earliest_date_val'] = ''
        record['earliest_date_prec'] = ''
        record['latest_date_val'] = ''
        record['latest_date_prec'] = ''

    # add statement for sourcing circumstances qualifier P1480 when "circa" (Q5727902)
    record['sourcing_circumstances'] = 'Q5727902' if inception['circa'] else ''

    '''  
    print('date:', works[work_index]['date'])
    print('inception:', works[work_index]['inception_val'], works[work_index]['inception_prec'])            
    print('earliest date:', works[work_index]['earliest_date_val'], works[work_index]['earliest_date_prec'])
    print('latest date:', works[work_index]['latest_date_val'], works[work_index]['latest_date_prec'])
    print()
    '''

# Save the works with their processed dates. The first record's keys are used
# as the column names of the output file.
fieldnames = list(works[0])
write_dicts_to_csv(works, 'gallery_works_with_dates3.csv', fieldnames)

# The string below is a disabled debugging aid, kept for reference only.
'''
# output test table for dates for testing, not needed since script is now working.
out_table = []
for work in works:
    out_dict = {}
    out_dict['string'] = work['date']
    out_dict['singe_date'] = work['inception']['dates'][0]['value']
    out_dict['singe_era'] = work['inception']['dates'][0]['era']
    out_dict['first_date'] = work['inception']['dates'][1]['value']
    out_dict['first_era'] = work['inception']['dates'][1]['era']
    out_dict['second_date'] = work['inception']['dates'][2]['value']
    out_dict['second_era'] = work['inception']['dates'][2]['era']
    out_dict['circa'] = work['inception']['circa']
    out_dict['century'] = work['inception']['century']
    out_table.append(out_dict)

fieldnames = list(out_table[0].keys())
write_dicts_to_csv(out_table, 'test_dates.csv', fieldnames)
'''

print('done')


# Clean up acquisition dates

This script cleans up the `date_acquired` field. Most of the values are years, but a few are not. For older works, however, the recorded date is when the work was acquired by the Peabody gallery, so the actual acquisition year for VU should be taken from the accession number instead.

In [None]:
# Read the works produced by the date-processing cell, extract a cleaned
# acquisition year from the free-text date_acquired column, and derive the
# collection start time from the first four characters of the accession number.
filename = 'gallery_works_with_dates3.csv'
works = read_dict(filename)

# This section was used to develop the cleanup routine with a non-redundant list
'''
for work_index in range(len(works)):
    if works[work_index]['date_acquired'].strip() != '':
        works[work_index]['date_acquired'] = works[work_index]['date_acquired'].strip()
acquisition_dates = non_redundant(works, 'date_acquired')
acquisition_dates.sort(key = sort_funct)
'''

for work in works:
    # remove surrounding whitespace and commas
    date = work['date_acquired'].strip().replace(',', '')
    # values are either slash-delimited dates or space-delimited words
    pieces = date.split('/') if '/' in date else date.split(' ')

    # The last piece that parses as an integer above 1000 is taken as the year.
    # A "ca." marker in the string is deliberately ignored: circa is not an
    # appropriate qualifier for a collection statement.
    year = ''
    for piece in pieces:
        try:
            number = int(piece)
        except ValueError:
            continue
        if number > 1000:
            year = str(number)
    # Always create the column (blank when no year was found) so that every row
    # has the same keys as the first row, whose keys become the CSV header below.
    work['acquired_cleaned'] = year

    accession_number = work['accession_number']
    if accession_number != '':
        # the accession number starts with the four-digit year of accession
        work['collection_start_time_val'] = accession_number[0:4] + '-01-01T00:00:00Z'
        work['collection_start_time_prec'] = '9' # precision to year
    else:
        work['collection_start_time_val'] = ''
        work['collection_start_time_prec'] = ''

fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_acquisition4.csv', fieldnames)

print('done')

# Build output file for Wikidata upload

The data here needs to be reconciled against data already downloaded from Wikidata using another script. Those downloaded data are in the `works_multiprop.csv` file. The data generated here will be added to that file.

In [None]:
# open source files

# Existing items; new rows generated below are appended to a copy of this table.
filename = 'works_multiprop.csv'
items = read_dict(filename)
fieldnames = list(items[0].keys())
# print(fieldnames)

# Every column whose name contains "_ref1_referenceUrl" identifies a property that
# carries a reference. Record the property prefix and the column holding its value.
data_columns = []
for fieldname in fieldnames:
    if '_ref1_referenceUrl' in fieldname:
        data_column = {}
        data_column['prefix'] = fieldname[:len(fieldname)-18] # drop the 18-character suffix
        if data_column['prefix'] in fieldnames:
            data_column['name'] = data_column['prefix']
        else: # value node values don't have a column name that is the prefix
            data_column['name'] = data_column['prefix'] + '_val'
        data_columns.append(data_column)
#print(data_columns)

filename = 'gallery_works_acquisition4.csv'
works = read_dict(filename)

# fieldnames = list(works[0].keys())
# print(fieldnames)

# Each creator row holds a JSON array of name strings; flatten to one entry per string.
filename = 'creators.csv'
creators_raw = read_dict(filename)
creators = []
for creator in creators_raw:
    strings = json.loads(creator['creator_string'])
    for string in strings:
        creators.append({'qid': creator['qid'], 'name': creator['name'], 'string': string})

filename = 'classification_mappings.csv'
classifications = read_dict(filename)

filename = 'country_mappings.csv'
countries = read_dict(filename)

filename = 'creator-multi.csv'
creator2 = read_dict(filename)

# set up error logs
missing_creators = []
missing_classifications = []
missing_countries = []

output = deepcopy(items)

# Inventory numbers already present in the existing items, built once so that
# each work is screened with a set lookup instead of a scan of the whole table.
existing_inventory_numbers = {item['inventory_number'] for item in items}

count = 0
for work in works:
    if count % 1000 == 0:
        print(count)
    dic = {}

    # Screening section. Several screens are applied to suppress a record from being written
    suppressed = False

    # Prevent existing items from being modified by data from Artstor.
    # If the accession number of the work matches an existing item, at this point we will just skip it.
    # At some point in the future, we would want to check for corrected or updated information.
    if work['accession_number'] in existing_inventory_numbers:
        suppressed = True

    # Suppress works with labels that are too long
    if len(work['title']) > 150:
        suppressed = True

    # Suppress works that aren't classified with a genre since they won't have instance_of
    if work['classification'] == '':
        suppressed = True

    # For now, suppress portrait aspect works. The comparison is only possible when
    # both dimensions are present; a blank width would make float() raise.
    if work['height'] != '' and work['width'] != '' and float(work['height']) > float(work['width']):
        suppressed = True

    # For now, suppress all works with "diameters" because a lot of them are lengths
    if work['diameter'] != '':
        suppressed = True

    dic['inventory_number'] = work['accession_number']

    # For now, suppress all works with missing creators
    found = False
    for creator in creators:
        if work['creator_string'] == creator['string']:
            artist_name = creator['name']
            dic['creator'] = creator['qid']
            found = True
            break
    if not found:
        suppressed = True
        missing_creators.append({'inventory_number': "#" + dic['inventory_number'], 'string': work['creator_string']})
        artist_name = ''
        dic['creator'] = ''

    # Also suppress works whose creator string was matched but has no Q ID.
    # Test the resolved value rather than the loop variable, which is only
    # meaningful when the loop above ended on a match.
    if dic['creator'] == '':
        suppressed = True

    # Generate the lines for all non-suppressed works
    if not suppressed:
        dic['inventory_number_collection'] = 'Q18563658' # Fine Arts Gallery
        dic['label_en'] = work['title']

        # title
        # The title column is hard-coded as English, so suspected non-English titles
        # should be suppressed
        lang, prec = detect_language(work['title'])
        if lang == 'en' and prec > 0.99:
            dic['title'] = work['title']
        else:
            dic['title'] = ''
        dic['title_subtitle'] = ''

        # find second creator if there is one; only used for the description. 2nd creator item will be added
        # as part of a second table
        found = False
        for creator in creator2:
            if work['creator_string'] == json.loads(creator['creator_string'])[0]:
                second_artist_name = creator['name']
                found = True
                break
        if not found:
            # NOTE(review): this logs every work without a second creator as a missing
            # creator, which looks unintended; behavior kept as-is pending confirmation.
            missing_creators.append({'inventory_number': "#" + dic['inventory_number'], 'string': work['creator_string']})
            second_artist_name = ''

        # instance of
        found = False
        for classification in classifications:
            if work['classification'] == classification['string']:
                genre_string = classification['label']
                dic['instance_of'] = classification['qid']
                found = True
                break
        if not found:
            missing_classifications.append({'inventory_number': "#" + dic['inventory_number'], 'string': work['classification']})
            genre_string = ''
            dic['instance_of'] = ''

        # description: "<genre> by <artist>" (plus " and <second artist>" when there is one)
        if second_artist_name == '':
            if 'attributed to' in artist_name:
                # NOTE(review): no separator is inserted between genre and name here;
                # confirm the names in creators.csv supply their own leading space.
                dic['description_en'] = genre_string + artist_name
            else:
                dic['description_en'] = genre_string + ' by ' + artist_name
        else:
            dic['description_en'] = genre_string + ' by ' + artist_name + ' and ' + second_artist_name
        dic['inception_val'] = work['inception_val']
        dic['inception_prec'] = work['inception_prec']
        dic['inception_earliest_date_val'] = work['earliest_date_val']
        dic['inception_earliest_date_prec'] = work['earliest_date_prec']
        dic['inception_latest_date_val'] = work['latest_date_val']
        dic['inception_latest_date_prec'] = work['latest_date_prec']
        dic['inception_sourcing_circumstances'] = work['sourcing_circumstances']

        # country
        found = False
        for country in countries:
            if work['country_culture'] == country['string']:
                dic['country'] = country['qid']
                dic['country_of_origin'] = country['qid']
                found = True
                break
        if not found:
            missing_countries.append({'inventory_number': "#" + dic['inventory_number'], 'string': work['country_culture']})
            dic['country'] = ''
            dic['country_of_origin'] = ''

        # dimensions: each value gets the unit item Q218593 only when the value is present
        dic['height_val'] = work['height']
        if work['height'] != '':
            dic['height_unit'] = 'Q218593'
        else:
            dic['height_unit'] = ''

        dic['width_val'] = work['width']
        if work['width'] != '':
            dic['width_unit'] = 'Q218593'
        else:
            dic['width_unit'] = ''

        dic['thickness_val'] = work['depth']
        if work['depth'] != '':
            dic['thickness_unit'] = 'Q218593'
        else:
            dic['thickness_unit'] = ''

        dic['diameter_val'] = work['diameter']
        if work['diameter'] != '':
            dic['diameter_unit'] = 'Q218593'
        else:
            dic['diameter_unit'] = ''

        dic['collection'] = 'Q18563658' # Fine Arts Gallery
        # !!! Can we get this from the inventory number if missing?
        dic['collection_start_time_val'] = work['collection_start_time_val']
        dic['collection_start_time_prec'] = work['collection_start_time_prec']
        dic['location'] = 'Q18563658' # Vanderbilt University Fine Arts Gallery

        # generate references
        for column in data_columns:
            try:
                if dic[column['name']] != '':
                    dic[column['prefix'] + '_ref1_referenceUrl'] = 'https://library.artstor.org/#/asset/' + work['ssid']
                    dic[column['prefix'] + '_ref1_retrieved_val'] =  ref_retrieved
                    dic[column['prefix'] + '_ref1_retrieved_prec'] =  '11'
            except KeyError:
                # some columns are passed through without values, so they have no
                # entry in dic and get no reference
                pass

        output.append(dic)
    count += 1

# output data
fieldnames = list(output[0].keys())
write_dicts_to_csv(output, 'gallery_works_to_write_dup.csv', fieldnames)

# write error logs
fieldnames = ['inventory_number', 'string']
write_dicts_to_csv(missing_creators, 'missing_creators.csv', fieldnames)
write_dicts_to_csv(missing_classifications, 'missing_classifications.csv', fieldnames)
write_dicts_to_csv(missing_countries, 'missing_countries.csv', fieldnames)


print('done')


# Remove duplicate label/description combinations

Wikidata does not allow a record to be written if its label/description combination is the same as that of an existing item. Records that duplicate one another locally would therefore collide during upload, so we must not try to write them. This cell eliminates local duplicates.

In [None]:
filename = 'gallery_works_to_write_dup.csv'
dup_works = read_dict(filename)


def same_label_and_description(one, other):
    """Return True when two records share both label_en and description_en."""
    return one['label_en'] == other['label_en'] and one['description_en'] == other['description_en']


# Repeatedly take the first pending record and compare it with all the others.
# A record is kept only when nothing else matched it; when matches exist, the
# record and all of its matches are dropped. The records that did not match are
# carried into the next pass in back-to-front order.
works = []

count = 0
while dup_works:
    if count % 10 == 0: # make something show up so we know it's working
        print(count)
    candidate = dup_works[0]
    remaining = [
        other
        for other in reversed(dup_works[1:])
        if not same_label_and_description(other, candidate)
    ]
    # every other record survived, so the candidate is unique
    if len(remaining) == len(dup_works) - 1:
        works.append(candidate)
    dup_works = remaining
    count += 1

# output data

fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_to_write.csv', fieldnames)
print('done')

The following cell checks items against Wikidata labels and descriptions (hacked from vb5_check_labels_descriptions.py)

Probably good to do a sort by qid, label, description before running.

In [None]:
filename = 'gallery_works_to_write.csv'
works = read_dict(filename)


def escape_sparql_string(text):
    """Escape backslashes and single quotes for use inside a SPARQL '''...''' literal.

    Without this, a label or description that contains a backslash, or that
    starts or ends with a single quote, produces a malformed query.
    """
    return text.replace('\\', '\\\\').replace("'", "\\'")


# For every row that has no Q ID yet, ask the query service whether an item with
# the same English label and description already exists, and warn if one does.
#for work_index in range(41,42):
for work_index in range(len(works)):
    if work_index % 10 == 0: # make something show up so we know it's working
        print(work_index)
    if works[work_index]['qid'] == '':
        # Long (triple-quoted) literals are used so the values may contain newlines.
        query = """select distinct ?item where {
          ?item rdfs:label '''""" + escape_sparql_string(works[work_index]['label_en']) + """'''@en.
          ?item schema:description '''""" + escape_sparql_string(works[work_index]['description_en']) + """'''@en.
          }"""

        #print('Checking label: "' + works[work_index]['label_en'] + '", description: "' + works[work_index]['description_en'] + '"')

        r = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
        try:
            data = r.json()
            results = data['results']['bindings']
            #print(results)

            if len(results) > 0:
                match = extract_qnumber(results[0]['item']['value'])
                # + 2 converts the zero-based index to a spreadsheet row (header is row 1)
                print('Warning! Row ' + str(work_index + 2) + ' is the same as ' + match)
        except Exception:
            # The response was not the expected JSON (e.g. an error page); show it
            # and carry on. KeyboardInterrupt is deliberately not caught, so the
            # loop can still be stopped by hand.
            print(r.text)

        # wait between queries, as configured for the SPARQL endpoint
        sleep(sparql_sleep)

print('done')

# --------------

# STOP HERE

# --------------

The following scripts are for one-time use and have already been run!

# Generate classes

This section first gets all of the classes (values for P31 instanceOf) from the Met's collection. Then it tries to match the labels of those items to the values in the classification column of the gallery data. The source file for the `works` list can really be any of the series because the `classification` field isn't pre-processed.

In [None]:
# get a list of all classes in the Met
work_classes = retrieve_gallery_classes()
# print(work_classes)

filename = 'gallery_works_with_dim2.txt'
works = read_dict(filename)

# distinct classification strings, sorted, without the blank entry when it sorts first
values = non_redundant(works, 'classification')
values.sort(key = sort_funct)
if values[0] == '':
    del values[0]
print(values)

# Match each classification string to class labels, ignoring case. Every class
# whose label matches yields a mapping row; a string with no match yields one
# row with a blank qid and label.
mappings = []
for value in values:
    matched_rows = [
        {'string': value, 'qid': work_class['qid'], 'label': work_class['label']}
        for work_class in work_classes
        if value.lower() == work_class['label'].lower()
    ]
    if matched_rows:
        mappings.extend(matched_rows)
    else:
        mappings.append({'string': value, 'qid': '', 'label': ''})
write_dicts_to_csv(mappings, 'classification_mappings.csv', ['string', 'qid', 'label'])
print()
print('done')


# Generate country table

Match labels in the country_culture column with labels and aliases in Wikidata for various country-like items. NOTE: this requires hand-processing after generation, so it shouldn't be re-run.

In [None]:
# distinct country/culture strings, sorted, without the blank entry when it sorts first
values = non_redundant(works, 'country_culture')
values.sort(key = sort_funct)
if values[0] == '':
    del values[0]
print(values)

# Search Wikidata for each string, restricted to the three listed classes.
# One hit is stored as that Q ID, several hits are stored as the whole list
# (to be resolved by hand), and no hit is stored as a blank.
mappings = []
for value in values:
    result_list = searchLabelsAtWikidata(value, ['Q6256','Q3624078','Q6266'])
    print('|' + value + '|', result_list)
    hit_count = len(result_list)
    if hit_count == 0:
        qid = ''
    elif hit_count == 1:
        qid = result_list[0]
    else:
        qid = result_list
    mappings.append({'string': value, 'qid': qid})
write_dicts_to_csv(mappings, 'country_mappings.csv', ['string', 'qid'])
print('done')


# Process medium field

The medium field could form the basis of the description field, but is also used to generate the material used (P186) values.

Currently, the `medium.csv` file isn't really used for anything, but it could be used in conjunction with the materials dictionary to describe the materials in the object. This is an area for future work.

In [None]:
# It doesn't really matter which source file is used, since there is no pre-processing done on this column
filename = 'gallery_works_with_dates.txt'
works = read_dict(filename)
fieldnames = list(works[0].keys())
# for field in fieldnames:
#    print(field)

# Trim surrounding whitespace from the medium strings (whitespace-only values are left untouched)
for work in works:
    trimmed = work['medium'].strip()
    if trimmed != '':
        work['medium'] = trimmed

medium_strings = non_redundant(works, 'medium')
medium_strings.sort(key = sort_funct)

# Split each distinct medium string into up to four parts:
#   material1 / material2: a single-word string, or the two sides of "x and y"
#                          (only when the string has no " on ")
#   medium / surface:      the two sides of "x on y"
# Parts that do not apply are blank.
out_table = []
materials_list = []
for string in medium_strings:
    has_on = ' on ' in string

    if ' ' not in string: # a single word
        material1, material2 = string, ''
    elif not has_on and ' and ' in string:
        material1, material2 = string.split(' and ')[:2]
    else:
        material1 = material2 = ''

    if has_on:
        medium, surface = string.split(' on ')[:2]
    else:
        medium = surface = ''

    out_dict = {
        'string': string,
        'material': [material1, material2, medium, surface],
        'material1': material1,
        'material2': material2,
        'medium': medium,
        'surface': surface,
    }
    out_table.append(out_dict)
    # collect every part, lower-cased and trimmed, for the overall materials list
    materials_list.extend(part.lower().strip() for part in out_dict['material'])

fieldnames = list(out_table[0].keys())
write_dicts_to_csv(out_table, 'medium.csv', fieldnames)

# print the distinct materials, one per line
materials_list = non_redundant(materials_list, '')
materials_list.sort(key = sort_funct)
for material in materials_list:
    print(material)


print('done')


The following cell is a one-time script to generate a non-redundant list of material strings. It later gets hand-edited, so don't re-run it!

In [None]:
# Load and process the materials list derived from above. The materials.csv file was manually
# created by copy and paste of the list at the end of the previous script.
#
# Every row whose 'searched' column is empty is looked up at Wikidata:
#   - no hits:      'matches' is set to 'no'
#   - one hit:      its Q ID is stored in 'qid' without asking
#   - several hits: 'matches' is set to 'yes', the candidates are listed, and the user picks one
# The row is then marked as searched and the whole table is written back to the file.
filename = 'materials.csv'
creator_data = read_dict(filename)
fieldnames = list(creator_data[0].keys())

# This is a hack of the creators disambiguation routine below, which is why the table is
# still called creator_data even though each row describes a material.
for material_row in creator_data:
    if material_row['searched'] != '':
        continue # already handled in an earlier run

    material_string = material_row['material']
    print(material_string)
    print()
    results = searchNameAtWikidata(material_string)
    if len(results) == 0:
        print('No results')
        print()
        material_row['matches'] = 'no'
    elif len(results) == 1:
        # A single hit is accepted automatically; its description is printed for reference.
        print(results)
        print('match with', searchWikidataDescription(results[0]['qId'])['description'])
        material_row['qid'] = results[0]['qId']
    else:
        material_row['matches'] = 'yes'
        display_strings = []
        for result in results:
            description = searchWikidataDescription(result['qId'])['description']
            similarity_score = name_variant_testing(material_string, result['name'])
            display_strings.append({'qid': result['qId'], 'name': result['name'], 'description': description, 'score': similarity_score})

        # List the candidates ordered by sort_score, descending. The number printed in front of
        # each candidate is its position in this sorted list, not its position in results.
        display_strings.sort(key = sort_score, reverse = True)
        for index, candidate in enumerate(display_strings):
            print(index, candidate['score'], candidate['name'], 'https://www.wikidata.org/wiki/' + candidate['qid'])
            print('Description:', candidate['description'])
            print()

        # Keep asking until the user presses Enter (no match) or types a listed number.
        while True:
            choice = input('number of match or Enter for no match').strip()
            if choice == '':
                break
            try:
                chosen_index = int(choice)
            except ValueError:
                chosen_index = -1 # not a number; rejected by the range check below
            if 0 <= chosen_index < len(display_strings):
                # Look the Q ID up in the sorted display list that the user saw.
                material_row['qid'] = display_strings[chosen_index]['qid']
                break
            print('Invalid selection; enter a number from 0 to', len(display_strings) - 1)

    material_row['searched'] = 'yes'
    # Written after every row, so the file always reflects the rows handled so far.
    write_dicts_to_csv(creator_data, filename, fieldnames)
    print()
    print()
print('done')

# Disambiguate creators

The following functions are the name-related ones taken from `vb3_match_wikidata.py`.

The following cell creates a non-redundant list of processed names by extracting them from the creator field, then reversing them to given name first.

Note: don't run this again when there is a creators.csv file already because it will overwrite any data that have been processed by the next script!

In [None]:
# Build creators.csv from the distinct creator strings found in the works table.
# Each output row holds:
#   last_first     - the creator string after remove_parens()
#   description    - the result of remove_description() on the creator string
#   name           - last_first passed through reverse_names() when it contains a comma,
#                    otherwise last_first unchanged
#   creator_string - the original string, stored as a JSON array with one element
creator_strings = non_redundant(works, 'creator_string')
print(len(creator_strings))

creator_data = []
for creator_string in creator_strings:
    last_first = remove_parens(creator_string)
    # The key order here determines the column order of creators.csv (see fieldnames below).
    creator_datum = {
        'last_first': last_first,
        'description': remove_description(creator_string),
        'name': reverse_names(last_first) if ',' in last_first else last_first,
        # ensure_ascii=False keeps non-ASCII characters in names readable in the CSV
        'creator_string': json.dumps([creator_string], ensure_ascii=False),
    }
    creator_data.append(creator_datum)

creator_data.sort(key = sort_last_first)

fieldnames = list(creator_data[0].keys())
write_dicts_to_csv(creator_data, 'creators.csv', fieldnames)

print('done')


The following script isn't being maintained any more because there is a stand-alone file, `screen_creators.py`, which is run from the command line. It has edits that aren't found here.

In [None]:
# Interactively match creator names from creators.csv to Wikidata items.
#
# Every row whose 'searched' column is empty is looked up at Wikidata. Hits that fail the
# human() check or whose description starts with 'Peerage person ID=' are ignored. A hit is
# accepted automatically when its dates string occurs in the creator's description and the
# name similarity score is above 95; otherwise the remaining candidates are listed and the
# user picks one. The row is then marked as searched and the table is written back to the file.
creator_data = read_dict('creators.csv')
fieldnames = list(creator_data[0].keys())

# To re-check a single row while debugging, iterate over a slice of the table instead.
for creator_index, creator in enumerate(creator_data):
    if creator['searched'] == '':
        match = False # becomes True when a candidate is accepted automatically
        print(creator['name'])
        print(creator['description'])
        print()
        results = searchNameAtWikidata(creator['name'])
        if len(results) == 0:
            print('No results')
            print()
            creator['matches'] = 'no'
        else:
            creator['matches'] = 'yes'
            display_strings = []
            for result in results:
                result_qid = result['qId']
                if not human(result_qid):
                    continue
                wikidata_descriptions = searchWikidataDescription(result_qid)
                description = wikidata_descriptions['description']
                if description.startswith('Peerage person ID='):
                    continue

                # Only the first four characters of the first returned value are kept
                # (presumably the year -- confirm against the query class).
                birthDateList = retrieve_birth_date_query.single_property_values_for_item(result_qid)
                birth_date = birthDateList[0][0:4] if len(birthDateList) >= 1 else ''

                deathDateList = retrieve_death_date_query.single_property_values_for_item(result_qid)
                death_date = deathDateList[0][0:4] if len(deathDateList) >= 1 else ''

                if birth_date != '' and death_date != '':
                    dates = birth_date + '-' + death_date
                elif birth_date != '':
                    dates = 'born ' + birth_date
                elif death_date != '':
                    dates = 'died ' + death_date
                else:
                    dates = ''

                similarity_score = name_variant_testing(creator['name'], result['name'])
                # if there is an exact dates match and high name similarity, just assign Q ID
                if dates != '' and dates in creator['description'] and int(similarity_score) > 95:
                    match = True
                    creator['qid'] = result_qid
                    print('Auto match with', result['name'], dates, 'https://www.wikidata.org/wiki/' + result_qid)
                    break # kill the results loop

                display_strings.append({'qid': result_qid, 'name': result['name'], 'dates': dates, 'description': description, 'occupation': wikidata_descriptions['occupation'], 'score': similarity_score})

            if match:
                pass
            elif len(display_strings) == 0:
                print('No results')
                print()
            else:
                # List the candidates ordered by sort_score, descending. The number printed in
                # front of each one is its position in this filtered, sorted list, which does
                # not correspond to its position in results.
                display_strings.sort(key = sort_score, reverse = True)
                for index, candidate in enumerate(display_strings):
                    print(index, candidate['score'], candidate['name'], 'https://www.wikidata.org/wiki/' + candidate['qid'])
                    print(candidate['dates'])
                    print('Description:', candidate['description'])
                    print('Occupation:', candidate['occupation'])
                    print()

                # Keep asking until the user presses Enter (no match) or types a listed number.
                while True:
                    choice = input('number of match or Enter for no match').strip()
                    if choice == '':
                        break
                    try:
                        chosen_index = int(choice)
                    except ValueError:
                        chosen_index = -1 # not a number; rejected by the range check below
                    if 0 <= chosen_index < len(display_strings):
                        # Look the Q ID up in the display list that the user saw.
                        creator['qid'] = display_strings[chosen_index]['qid']
                        break
                    print('Invalid selection; enter a number from 0 to', len(display_strings) - 1)
        creator['searched'] = 'yes'
        # Written after every row, so the file always reflects the rows handled so far.
        write_dicts_to_csv(creator_data, 'creators.csv', fieldnames)
        print()
        print()
