# Extract data

This section simplifies the column headers and writes a copy of the data to a CSV file.

In [None]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
from fuzzywuzzy import fuzz # fuzzy logic matching

# ----------------
# Configuration settings
# ----------------

# Pause applied after each SPARQL request made by the query functions in this notebook
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
endpoint = 'https://query.wikidata.org/sparql' # URL that the query functions POST their SPARQL text to
accept_media_type = 'application/json' # sent as the Accept header; responses are decoded with r.json()

# ----------------
# Utility functions
# ----------------

# Some Wikimedia servers reject unidentified clients, so a User-Agent header is always sent
def generate_header_dictionary(accept_media_type):
    """Build the HTTP headers used when posting a SPARQL query.

    accept_media_type: value to send in the Accept header.
    Returns a dictionary with the Accept, Content-Type and User-Agent headers.
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# Headers built once when this cell runs; reused by the query functions that pass headers=requestheader
requestheader = generate_header_dictionary(accept_media_type)

# load a CSV file as a list of row dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file whose first row is the header.

    filename: path of the file to read.
    Returns a list with one dictionary per data row, keyed by column header.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # DictReader is lazy, so the rows must be collected before the file closes
        return list(csv.DictReader(csv_file))

# save a list of row dictionaries as a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a table to a UTF-8 CSV file, replacing any existing file.

    table: iterable of dictionaries, one per row.
    filename: path of the file to write.
    fieldnames: column headers, in the order they should appear.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# collect the distinct values found in one column
def non_redundant(table, column_key):
    """Return the distinct values of a column, in order of first appearance.

    table: iterable of row dictionaries.
    column_key: key of the column to examine in every row.
    """
    unique_values = []
    for row in table:
        value = row[column_key]
        already_seen = any(value == seen for seen in unique_values)
        if not already_seen:
            unique_values.append(value)
    return unique_values

# function to use in sort of simple list
def sort_funct(row):
    """Identity sort key: each list element is ordered by its own value."""
    return row

# function to use in sort last_first names
def sort_last_first(row):
    """Sort key for row dictionaries: order rows by their 'last_first' value."""
    return row['last_first']

# function to use in sort by match score
def sort_score(row):
    """Sort key for row dictionaries: order rows by their 'score' value."""
    return row['score']

# pulls the Q ID out of a Wikidata entity IRI
def extract_qnumber(iri):
    """Return the fifth '/'-separated segment of an IRI.

    For an IRI shaped like http://www.wikidata.org/entity/Q6386232 that segment is
    the Q ID. An IRI with fewer than five segments raises IndexError.
    """
    return iri.split('/')[4]

# search label and alias
# The class test uses the full statement path
#
#   ?id p:P31 ?statement.
#   ?statement ps:P31 ?class.
#
# rather than the shortcut
#
#   ?id wdt:P31 ?class.
#
# because with the shortcut England (wd:Q21) was not found as an instance of
# Q6256 (country), while with the statement path it was.
# NOTE(review): presumably because wdt: only exposes best-rank statements - confirm.
def searchLabelsAtWikidata(string, class_list):
    """Find Wikidata items whose English label or alias equals a string.

    string: the exact label/alias text to look for (matched with the @en language tag).
    class_list: Q IDs; an item must have a P31 statement with one of them as its value.
        With an empty list no class restriction is added to the query.

    Returns a list of Q ID strings, empty when nothing matches.
    Sleeps for sparql_sleep seconds after the request.
    """
    # Escape backslashes and double quotes so that a label containing them
    # still forms a valid SPARQL string literal instead of a malformed query.
    literal = string.replace('\\', '\\\\').replace('"', '\\"')

    # create a string for the query
    query = 'select distinct ?id '
    query += '''where {
  {?id rdfs:label "''' + literal + '''"@en.}
  union
  {?id skos:altLabel "''' + literal + '''"@en.}
  '''
    # one group per acceptable class, joined by "union"
    class_patterns = [
        '''{?id p:P31 ?statement.
  ?statement ps:P31 wd:''' + class_qid + '''.}
  '''
        for class_qid in class_list
    ]
    query += 'union\n  '.join(class_patterns)
    query += '''}'''

    r = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    data = r.json()
    return_value = [extract_qnumber(result['id']['value']) for result in data['results']['bindings']]

    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def retrieve_gallery_classes():
    """Retrieve the classes of items in the collection wd:Q160236.

    The query selects every distinct P31 value of items whose P195 value is
    wd:Q160236, together with the English label of that class, ordered by label.

    Returns a list of dictionaries with the keys 'label' and 'qid'.
    Sleeps for sparql_sleep seconds after the request.
    """
    # The collection is pinned to the Metropolitan Museum of Art because leaving
    # the collection unspecified would match too many collections.
    query = '''select distinct ?class ?label where 
      {
      ?item wdt:P195 wd:Q160236.
      ?item wdt:P31 ?class.
      ?class rdfs:label ?label.
      filter(lang(?label) = 'en')
      }
      order by ?label'''

    print('sending query')
    response = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    print('results returned')
    bindings = response.json()['results']['bindings']

    classes = []
    for binding in bindings:
        class_qid = extract_qnumber(binding['class']['value'])
        classes.append({'label': binding['label']['value'], 'qid': class_qid})

    # pause so that the SPARQL endpoint is not hit too rapidly
    sleep(sparql_sleep)

    return classes

def generateNameAlternatives(name):
    """Generate written variants of a personal name for label matching.

    Commas are treated as spaces and periods are dropped. A trailing generational
    suffix (Jr, Sr, II, III, IV, V, or "the elder") is separated from the name.
    The variants combine full given names, initials (with and without periods)
    and the last name, e.g. "John Quincy Adams" gives "John Q. Adams", "J Adams",
    "JQ Adams" and so on.

    name: the name, given names first.

    Returns a list of distinct variants in the order they were generated.
    A single-word name yields only itself (plus its suffixed form if any);
    a name with no usable words yields an empty list.
    """
    # trailing suffix token -> text appended to the full name
    suffix_forms = {
        'Jr': ', Jr.',
        'Sr': ', Sr.',
        'II': ' II',
        'III': ' III',
        'IV': ' IV',
        'V': ' V',
    }

    # split() with no argument discards the empty strings that replacing commas
    # with spaces would otherwise leave behind ("King, Jr." -> "King  Jr")
    raw_pieces = name.replace(',', ' ').replace('.', '').split()

    # Remove ", Jr.", "III", etc. from end of name
    suffix = ''
    if raw_pieces and raw_pieces[-1] in suffix_forms:
        suffix = suffix_forms[raw_pieces[-1]]
        raw_pieces = raw_pieces[:-1]
    elif len(raw_pieces) > 3 and raw_pieces[-2:] == ['the', 'elder']:
        suffix = ' the elder'
        raw_pieces = raw_pieces[:-2]

    # Build the initials in step with the pieces so that an index means the same
    # word in both lists.
    pieces = []
    initials = []
    for piece in raw_pieces:
        # make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        lettered = piece if piece[0:1].isalpha() else piece[1:]
        if len(lettered) > 0:
            pieces.append(piece)
            initials.append(lettered[0:1])
        # a piece that is a lone non-alphabetic character has no initial and is dropped

    if not pieces:
        return []

    full_name = ' '.join(pieces)
    alternatives = [full_name]

    # full name with suffix
    if suffix != '':
        alternatives.append(full_name + suffix)

    # The initial-based variants need a given name and a last name
    if len(pieces) > 1:
        first = pieces[0]
        last = pieces[-1]
        first_initial = initials[0]
        middle_initials = initials[1:-1]
        leading_initials = initials[:-1]

        # first and last name with middle initials
        alternatives.append(' '.join([first] + middle_initials + [last]))
        # first and last name with middle initials and periods
        alternatives.append(first + ' ' + ''.join(initial + '. ' for initial in middle_initials) + last)
        # first and last name only
        alternatives.append(first + ' ' + last)
        # first initial and last name only
        alternatives.append(first_initial + ' ' + last)
        # first initial with period and last name only
        alternatives.append(first_initial + '. ' + last)
        # all name initials with last name
        alternatives.append(' '.join(leading_initials + [last]))
        # all name initials with periods with last name
        alternatives.append(''.join(initial + '. ' for initial in leading_initials) + last)
        # all name initials concatenated with last name
        alternatives.append(''.join(leading_initials) + ' ' + last)

    # remove duplicates while keeping a stable order
    return list(dict.fromkeys(alternatives))

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias matches a name variant.

    name: the name to look for; its variants come from generateNameAlternatives().

    Returns a list of dictionaries with the keys 'qId' and 'name' (the item's
    English label). If the response cannot be processed, returns a one-item list
    [{'error': <response text>}] instead.
    Sleeps for sparql_sleep seconds after the request.
    """
    value_lines = ''
    for alternative in generateNameAlternatives(name):
        # get rid of quotes and backslashes, which will break the query
        for character in ('"', "'", '\\'):
            alternative = alternative.replace(character, '')
        value_lines += '"' + alternative + '"@en\n'
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + value_lines + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    results = []
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        data = r.json()
        for statement in data['results']['bindings']:
            wikidata_iri = statement['item']['value']
            label = statement['label']['value'] if 'label' in statement else ''
            # use the extract_qnumber defined in this notebook; the vb_common_code
            # module is not imported until later in the file
            results.append({'qId': extract_qnumber(wikidata_iri), 'name': label})
    except Exception:
        # best effort: hand the raw response text back to the caller
        results = [{'error': r.text}]
    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def name_variant_testing(name, variant):
    """Score how similar the given-name parts of two names are.

    Periods are removed from both names, everything after the last space (the
    last name) is set aside, and what remains is compared with fuzz.ratio.
    A name without any space is compared whole.

    Returns the value of fuzz.ratio for the two given-name strings.
    """
    def given_part(full_name):
        # drop periods, then cut off the final space-separated word if there is one
        without_periods = full_name.replace('.', '')
        if ' ' in without_periods:
            return without_periods.rpartition(' ')[0]
        return without_periods

    # Of the fuzzywuzzy ratios tried (ratio, partial_ratio, token_sort_ratio,
    # token_set_ratio), the plain ratio was judged the best fit.
    return fuzz.ratio(given_part(name), given_part(variant))

def find_surname_givens(name):
    """Split a name into given names and family name.

    Periods and commas are removed, one leading non-alphabetic character is
    stripped from each word, and one occurrence of each of the suffixes
    Jr, Sr, II, III, IV and V is removed. The last remaining word is the family
    name and the words before it are the given names.

    name: the name, given names first.

    Returns {'given': ..., 'family': ...}, or False when fewer than two words remain.
    """
    # Get rid of periods and commas
    name = name.replace('.', '').replace(',', '')

    pieces = []
    # split() with no argument also discards empty strings caused by repeated spaces
    for piece in name.split():
        # Make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        if not piece[0:1].isalpha():
            piece = piece[1:]  # remove the first non-alphabetic character
        if len(piece) > 0:  # keep only pieces that still have content
            pieces.append(piece)

    # Get rid of ", Jr.", "III", etc.
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Must be at least a surname and something else
    if len(pieces) < 2:
        return False

    # Put all but last piece together again
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def remove_parens(string):
    """Return the text before the first '(' with surrounding whitespace removed.

    A string without '(' is returned whole (stripped).
    """
    before_paren, _, _ = string.partition('(')
    return before_paren.strip()

def remove_description(string):
    """Return the text inside the first parenthetical of a string.

    The result is the text after the first '(' up to the next '(' or ')',
    whichever comes first, with surrounding whitespace removed.

    Returns '' when there is no '(' or when the argument is not a string.
    """
    # non-string input yields '' rather than an exception
    if not isinstance(string, str):
        return ''
    segments = string.split('(')
    if len(segments) < 2:
        # no opening parenthesis, so there is no description
        return ''
    return segments[1].split(')')[0].strip()

def reverse_names(string):
    """Turn "Last, First" into "First Last".

    Only the first two comma-separated parts are used; anything after a second
    comma is ignored. A string without a comma raises IndexError.
    """
    parts = string.split(',')
    family = parts[0].strip()
    given = parts[1].strip()
    return given + ' ' + family

# Screens for Wikidata items that are potential matches

# vb_common_code is a project-local module that is not visible here. The Query objects
# created below are used in this notebook only through single_property_values_for_item().
import vb_common_code as vbc
# P31 values of an item; used by human() to test for Q5
retrieve_class_list_query = vbc.Query(pid='P31', uselabel=False, sleep=sparql_sleep)
# P569 values; the screening cell keeps the first four characters of the first value as the birth year
# NOTE(review): isitem=False presumably requests literal values rather than items - confirm in vb_common_code
retrieve_birth_date_query = vbc.Query(isitem=False, pid='P569', sleep=sparql_sleep)
# P570 values; handled the same way to get the death year
retrieve_death_date_query = vbc.Query(isitem=False, pid='P570', sleep=sparql_sleep)

def human(qId):
    """Screen a Wikidata item: is it (possibly) a human?

    qId: Q ID of the item to check.

    Returns False only when the item has P31 values and none of them is Q5.
    An item with no P31 value at all passes the screen.
    """
    wdClassList = retrieve_class_list_query.single_property_values_for_item(qId)
    # Look at every class rather than only the first one: an item can have
    # several P31 values and Q5 need not be first in the returned list.
    return len(wdClassList) == 0 or 'Q5' in wdClassList

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Look up the English description, ORCID and occupations of a Wikidata item.

    qId: Q ID of the item.

    Returns a dictionary with the keys 'description' (string, '' if absent),
    'orcid' (string, '' if absent) and 'occupation' (list of English occupation
    labels). The dictionary is empty if the query returns no rows, and is
    {'error': <response text>} if the response cannot be processed.
    Sleeps for sparql_sleep seconds after the request.
    """
    resultsDict = {}
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:''' + qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:''' + qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')
            }
        optional {wd:''' + qId + ''' wdt:P496 ?orcid.}
      }'''
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        statements = r.json()['results']['bindings']
        if len(statements) > 0:  # if no results, the dictionary remains empty
            first_row = statements[0]
            # Only a single description per language is allowed, so there should only be one description
            resultsDict['description'] = first_row['description']['value'] if 'description' in first_row else ''
            # Only a single ORCID is allowed, so there should only be one orcid value
            resultsDict['orcid'] = first_row['orcid']['value'] if 'orcid' in first_row else ''
            # multiple rows mean the item has more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value']
                for statement in statements
                if 'occupation' in statement
            ]
    except Exception:
        # best effort: hand the raw response text back to the caller
        resultsDict = {'error': r.text}
    # pause to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return resultsDict


In [None]:
# Load the source table; every row is a dictionary keyed by the original column headers
filename = 'gallery_works.csv'
old_works = read_dict(filename)
# Column headers come from the first row, so the file must have at least one data row
fieldnames = list(old_works[0].keys())
print(fieldnames)


In [None]:
# New column name -> column header in gallery_works.csv.
# License and the Artstor fields are not populated, so they are not carried over.
column_map = {
    'ssid': 'SSID',
    'filename': 'Filename',
    'title': 'Title[637073]',
    'creator': 'Creator[637071]',
    'date': 'Date[637076]',
    'classification': 'Classification[637103]',
    'medium': 'Medium[637080]',
    'measurements': 'Measurements[637081]',
    'style_period': 'Style/Period[637079]',
    'country_culture': 'Country/Culture[637072]',
    'seals_inscriptions': 'Seals & Inscriptions[637104]',
    'signature': 'Signature[637105]',
    'description': 'Description[637092]',
    'publications': 'Publications[637106]',
    'exhibitions': 'Exhibitions[637107]',
    'accession_number': 'Accession Number[637085]',
    'date_acquired': 'Date Acquired[637109]',
    'gift_of': 'Gift of[637110]',
    'purchased_from': 'Purchased from[637111]',
    'credit_line': 'Credit Line[637112]',
    'provenance': 'Provenance[637113]',
    'collection': 'Collection[637114]',
    'last_change': 'Last Change[637115]',
    'notes': 'Notes[637116]',
    'rights': 'Rights[637099]',
    'media_url': 'Media URL',
}

# Copy every row under the simplified column names, keeping the order given above
works = []
for old_work in old_works:
    work = {new_name: old_work[old_name] for new_name, old_name in column_map.items()}
    works.append(work)

fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_renamed.csv', fieldnames)

print('done')

# Generate country table

Match labels in the country_culture column with labels and aliases in Wikidata for various country-like items.

In [None]:
# Distinct, sorted values of the country_culture column, ignoring blank cells
values = non_redundant(works, 'country_culture')
values.sort(key = sort_funct)
# values are distinct, so there is at most one empty string to drop; testing
# membership also copes with a column that yields no values at all
if '' in values:
    values.remove('')
print(values)

mappings = []
for value in values:
    # candidate items must have one of these classes as a P31 value
    result_list = searchLabelsAtWikidata(value, ['Q6256','Q3624078','Q6266'])
    print('|' + value + '|', result_list)
    if len(result_list) == 1:
        qid = result_list[0]
    elif len(result_list) > 1:
        # ambiguous: keep every candidate; the CSV cell then holds the text of the whole list
        qid = result_list
    else:
        qid = ''
    mappings.append({'string': value, 'qid': qid})
write_dicts_to_csv(mappings, 'country_mappings.csv', ['string', 'qid'])
print('done')


# Generate classes

This section first gets all of the classes (values for P31 instanceOf) from the Met's collection. Then it tries to match the labels of those items to the values in the classification column of the gallery data.

In [None]:
# get a list of all classes in the Met
work_classes = retrieve_gallery_classes()
print(work_classes)

In [None]:
# Distinct, sorted values of the classification column, ignoring blank cells
values = non_redundant(works, 'classification')
values.sort(key = sort_funct)
# values are distinct, so there is at most one empty string to drop; testing
# membership also copes with a column that yields no values at all
if '' in values:
    values.remove('')
print(values)

mappings = []
for value in values:
    # case-insensitive comparison with the class labels; every matching class gets its own row
    matching_classes = [
        work_class for work_class in work_classes
        if value.lower() == work_class['label'].lower()
    ]
    for work_class in matching_classes:
        mappings.append({'string': value, 'qid': work_class['qid'], 'label': work_class['label']})
    if not matching_classes:
        # keep unmatched strings in the table so that they can be filled in by hand
        mappings.append({'string': value, 'qid': '', 'label': ''})
write_dicts_to_csv(mappings, 'classification_mappings.csv', ['string', 'qid', 'label'])
print()
print('done')


# Parse dimensions data

This processes the values of the measurements column and separates them into length and width or length/width/height quantities.

In [None]:
def _parse_measurements(string):
    """Parse a measurements string into (height, width, depth, circum) strings.

    Recognized forms (after stripping surrounding whitespace):
      ''                    -> all four values empty
      '<number> in.'        -> the number is stored as circum
      '<n> x <n> in.'       -> height and width
      '<n> x <n> x <n> ...' -> height, width and depth (anything after the third number is ignored)

    Returns None when the string is not empty but cannot be parsed.
    """
    string = string.strip()
    if string == '':
        # no value; all variables are empty strings
        return ('', '', '', '')

    if ' x ' not in string:
        # one dimensional or improperly formatted
        pieces = string.split(' ')
        try:
            value = float(pieces[0])
            unit = pieces[1]
        except (ValueError, IndexError):
            # not a number, or nothing after the number
            return None
        if unit != 'in.':
            # second part of string not "in."
            return None
        return ('', '', '', str(value))

    # the string has an x in it, so it's multidimensional
    # split the string and get rid of leading and trailing whitespace
    pieces = [piece.strip() for piece in string.split('x')]
    # remove the "in." and any spaces from the last piece
    pieces[-1] = pieces[-1].split('in')[0].strip()
    # two pieces is a two-dimensional work; anything longer is treated as three-dimensional
    dimension_count = 2 if len(pieces) == 2 else 3
    try:
        numbers = [str(float(piece)) for piece in pieces[:dimension_count]]
    except ValueError:
        return None
    if dimension_count == 2:
        numbers.append('')  # no depth
    return (numbers[0], numbers[1], numbers[2], '')


for work in works:
    dimensions = _parse_measurements(work['measurements'])
    if dimensions is None:
        # report values that need to be fixed by hand and leave the dimensions empty
        print(work['accession_number'], work['measurements'].strip())
        dimensions = ('', '', '', '')
    work['height'], work['width'], work['depth'], work['circum'] = dimensions

fieldnames = list(works[0].keys())
write_dicts_to_csv(works, 'gallery_works_with_dim.csv', fieldnames)

print('done')


# Parse inception dates

In [None]:
# Reload the table written by the dimension-parsing cell, so this section can start from the saved file
filename = 'gallery_works_with_dim.csv'
works = read_dict(filename)
# Column headers come from the first row, so the file must have at least one data row
fieldnames = list(works[0].keys())
print(fieldnames)

In [None]:
def determine_era(string):
    """Separate a date string from its era designation.

    Returns a (value, era) tuple:
      - a string containing 'BCE' loses its last three characters; era is 'BCE'
      - any other string containing 'CE' loses its last two characters; era is 'CE'
      - a string without 'CE' (including the empty string) is returned unchanged
        with the era 'unknown'
    The designation is assumed to be at the end of the string.
    """
    # 'BCE' must be tested first because it also contains 'CE'
    if 'BCE' in string:
        return string[:-3].strip(), 'BCE'
    if 'CE' in string:
        return string[:-2].strip(), 'CE'
    return string, 'unknown'

# Parse the date column of every work into a structure stored under the key 'inception':
#   {'dates': [single, range_start, range_end], 'circa': bool, 'century': bool}
# where each entry of 'dates' is {'value': <digits as string or ''>, 'era': 'CE'/'BCE'/'unknown'}.
for work in works:
    string = work['date']

    # handle idiosyncratic date values
    if string in ('not dated', 'Unknown'):
        string = ''
    if string[0:5] == 'late ':
        string = string[5:]
    if string[0:4] == 'mid ':
        string = string[4:]
    # drop parenthetical comments after the dates
    if '(' in string:
        string = string.split('(')[0].strip()
    # fix bad century designation
    if 'th CE' in string:
        string = string.split('th CE')[0] + 'th century CE'

    # split dates: 0 is single date, 1 is beginning of range, 2 is end of range
    if '-' in string:
        range_parts = string.split('-')
        date_strings = ['', range_parts[0].strip(), range_parts[1].strip()]
    elif ' to ' in string:
        # match "to" only as a separate word, so that words that merely contain
        # the letters "to" are not split as if they were ranges
        range_parts = string.split(' to ')
        range_end = range_parts[1].strip()
        # handle special case of "late ... to early ..."
        if range_end[0:6] == 'early ':
            range_end = range_end[6:]
        date_strings = ['', range_parts[0].strip(), range_end]
    else:
        date_strings = [string.strip(), '', '']

    # extract CE and BCE
    date_list = []
    for date_string in date_strings:
        value, era = determine_era(date_string)
        date_list.append({'value': value, 'era': era})

    # If last date in range has a designation and the first one doesn't, assign it to the first date.
    if date_list[1]['value'] != '' and date_list[1]['era'] == 'unknown' and date_list[2]['era'] != 'unknown':
        date_list[1]['era'] = date_list[2]['era']

    # For dates with no specified era, assign CE
    for date in date_list:
        if date['value'] != '' and date['era'] == 'unknown':
            date['era'] = 'CE'

    # Create a date dict to hold more information about the date format
    date_dict = {'dates': date_list, 'circa': False, 'century': False}

    # Determine if date is circa
    for date in date_list:
        if date['value'][0:3] == 'ca.':
            date_dict['circa'] = True
            date['value'] = date['value'][3:].strip()

    # Determine if values are centuries
    for date in date_list:
        if date['value'][-7:] == 'century':
            date_dict['century'] = True
            date['value'] = date['value'][:-7].strip()
    if date_dict['century']: # if determined to be century values, strip off the "th"
        for date in date_list:
            date['value'] = date['value'][:-2]

    # append date dict to works
    work['inception'] = date_dict

# check for bad dates
print('Dates with problems that need to be fixed manually')
for work_index, work in enumerate(works):
    dates = work['inception']['dates']
    for date in dates:
        if date['value'] != '':
            try:
                int(date['value'])
            except ValueError:
                print(work_index, work['date'])
    # check for two-digit second numbers in ranges
    if dates[1]['value'] != '' and dates[1]['era'] == 'CE':
        try:
            out_of_order = int(dates[1]['value']) > int(dates[2]['value'])
        except ValueError:
            # Non-numeric values were already reported by the loop above; an empty
            # end-of-range value was not, so report it here instead of crashing.
            out_of_order = dates[2]['value'] == ''
        if out_of_order:
            print(work_index, work['date'])
        


In [None]:
# Flatten the parsed dates into one row per work so that the parsing can be checked in a spreadsheet
out_table = []
for work in works:
    inception = work['inception']
    single, first, second = inception['dates']
    out_table.append({
        'string': work['date'],
        'singe_date': single['value'],
        'singe_era': single['era'],
        'first_date': first['value'],
        'first_era': first['era'],
        'second_date': second['value'],
        'second_era': second['era'],
        'circa': inception['circa'],
        'century': inception['century'],
    })

fieldnames = list(out_table[0].keys())
write_dicts_to_csv(out_table, 'test_dates.csv', fieldnames)

print('done')


# Disambiguate creators

The following functions are names-related ones from vb3_match_wikidata.py

The following cell creates a non-redundant list of processed names by extracting them from the creator field, then reversing them to given name first.

Note: don't run this again when there is a creators.csv file already because it will overwrite any data that have been processed by the next script!

In [None]:
# Distinct values of the creator column
creator_strings = non_redundant(works, 'creator')
print(len(creator_strings))

creator_data = []
for creator_string in creator_strings:
    # the creator string is split into the name in front of the parenthetical
    # and the description inside it
    last_first = remove_parens(creator_string)
    creator_datum = {
        'last_first': last_first,
        'description': remove_description(creator_string),
    }
    # names containing a comma are reversed to put the part after the comma first
    if ',' in last_first:
        creator_datum['name'] = reverse_names(last_first)
    else:
        creator_datum['name'] = last_first
    # the original string is kept as a one-item JSON array
    creator_datum['creator_string'] = json.dumps([creator_string], ensure_ascii=False)
    creator_data.append(creator_datum)

creator_data.sort(key = sort_last_first)

fieldnames = list(creator_data[0].keys())
write_dicts_to_csv(creator_data, 'creators.csv', fieldnames)

print('done')


The following script isn't being maintained any more because there is a stand-alone file, `screen_creators.py`, which is run from the command line. It has edits that aren't found here.

In [None]:
creator_data = read_dict('creators.csv')
fieldnames = list(creator_data[0].keys())
# The bookkeeping columns are not part of a freshly generated creators.csv. Add them so
# that rows can be updated and written back without the CSV writer rejecting unknown keys.
for column in ('qid', 'matches', 'searched'):
    if column not in fieldnames:
        fieldnames.append(column)

for creator in creator_data:
    # skip creators that were handled in an earlier session
    if creator.get('searched', '') != '':
        continue

    print(creator['name'])
    print(creator['description'])
    print()
    results = searchNameAtWikidata(creator['name'])

    if len(results) > 0 and 'error' in results[0]:
        # the query failed; leave the creator unsearched so that it is retried on the next run
        print('Query failed:', results[0]['error'])
        print()
        continue

    if len(results) == 0:
        print('No results')
        print()
        creator['matches'] = 'no'
    else:
        creator['matches'] = 'yes'
        auto_matched = False
        display_strings = []
        for result in results:
            if not human(result['qId']):
                continue
            wikidata_descriptions = searchWikidataDescription(result['qId'])
            description = wikidata_descriptions.get('description', '')
            if description[0:18] == 'Peerage person ID=':
                continue

            # only the year (first four characters) of the first date value is used
            birthDateList = retrieve_birth_date_query.single_property_values_for_item(result['qId'])
            birth_date = birthDateList[0][0:4] if len(birthDateList) >= 1 else ''

            deathDateList = retrieve_death_date_query.single_property_values_for_item(result['qId'])
            death_date = deathDateList[0][0:4] if len(deathDateList) >= 1 else ''

            if death_date != '' and birth_date != '':
                dates = birth_date + '-' + death_date
            elif birth_date != '':
                dates = 'born ' + birth_date
            elif death_date != '':
                dates = 'died ' + death_date
            else:
                dates = ''

            similarity_score = name_variant_testing(creator['name'], result['name'])
            # if there is an exact dates match and high name similarity, just assign Q ID
            if dates != '' and dates in creator['description'] and int(similarity_score) > 95:
                auto_matched = True
                creator['qid'] = result['qId']
                print('Auto match with', result['name'], dates, 'https://www.wikidata.org/wiki/' + result['qId'])
                break # kill the results loop

            display_strings.append({
                'qid': result['qId'],
                'name': result['name'],
                'dates': dates,
                'description': description,
                'occupation': wikidata_descriptions.get('occupation', []),
                'score': similarity_score,
            })

        if auto_matched:
            pass
        elif len(display_strings) == 0:
            print('No results')
            print()
        else:
            display_strings.sort(key = sort_score, reverse = True)
            for index, candidate in enumerate(display_strings):
                print(index, candidate['score'], candidate['name'], 'https://www.wikidata.org/wiki/' + candidate['qid'])
                print(candidate['dates'])
                print('Description:', candidate['description'])
                print('Occupation:', candidate['occupation'])
                print()
            # The number typed refers to the sorted list printed above, so the Q ID
            # must be taken from display_strings at that position.
            while True:
                choice = input('number of match or Enter for no match').strip()
                if choice == '':
                    break
                if choice.isdigit() and int(choice) < len(display_strings):
                    creator['qid'] = display_strings[int(choice)]['qid']
                    break
                print('Please enter one of the numbers listed above, or just press Enter')

    creator['searched'] = 'yes'
    # save after every creator so that an interrupted session loses no work
    write_dicts_to_csv(creator_data, 'creators.csv', fieldnames)
    print()
    print()
