In [None]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os

# ----------------
# Configuration settings
# ----------------

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac (NOTE(review): not referenced by the code visible in this section)
endpoint = 'https://query.wikidata.org/sparql' # URL that the query functions POST their SPARQL queries to
accept_media_type = 'application/json' # value sent in the Accept header of each query

# ----------------
# Utility functions
# ----------------

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers sent with a SPARQL query.

    accept_media_type: value to place in the Accept header.

    Returns a dictionary holding the Accept, Content-Type and User-Agent headers.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    # the query text itself is sent as the request body
    headers['Content-Type'] = 'application/sparql-query'
    # identifies this client to the server
    headers['User-Agent'] = 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    return headers

# Header dictionary built once when this cell runs.
# NOTE(review): the query functions visible in this section call generate_header_dictionary() themselves and do not read this variable.
requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file as a list of dictionaries.

    filename: path of the CSV file; its first row supplies the dictionary keys.

    Returns a list with one dictionary per data row, in file order.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # the reader must be consumed before the file is closed
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of dictionaries as a UTF-8 CSV file with a header row.

    table: iterable of dictionaries, one per output row.
    filename: path of the file to create (an existing file is overwritten).
    fieldnames: column names, in the order the columns are written.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# find non-redundant values for a column
def non_redundant(table, column_key):
    """Collect the distinct values found in one column of a table.

    table: iterable of dictionaries (rows).
    column_key: key of the column to examine; every row must contain it.

    Returns a list of the distinct values in order of first appearance.
    """
    distinct_values = []
    for row in table:
        candidate = row[column_key]
        # a list (not a set) is searched, so values only need to support ==
        if candidate not in distinct_values:
            distinct_values.append(candidate)
    return distinct_values

# function to use in sort
def sort_funct(row):
    """Return the item unchanged, so a sort using this key orders items by their own value."""
    return row

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    """Return the Q identifier from a Wikidata entity IRI.

    iri: string of the form http://www.wikidata.org/entity/Q6386232

    Splitting that form on '/' gives 'http:', '', 'www.wikidata.org', 'entity'
    and the identifier, so the identifier is the segment at index 4.
    """
    return iri.split('/')[4]

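# Search Wikidata labels and aliases.
# Note: with the graph pattern
#
#   wd:Q21 wdt:P31 ?class.
#
# England (Q21) is not returned as an instance of Q6256 (country),
# but with the graph pattern
#
#   wd:Q21 p:P31 ?statement.
#   ?statement ps:P31 ?class.
#
# it is. (Probably because wdt: only matches best-rank "truthy" statements, while p:/ps: matches statements of any rank.)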
def searchLabelsAtWikidata(string, class_list):
    """Find Wikidata items whose English label or alias exactly matches a string.

    string: text to match against rdfs:label and skos:altLabel values tagged @en.
    class_list: list of Q IDs (e.g. 'Q6256'). When it is non-empty, an item must
        also have a P31 statement whose value is one of these classes.

    Returns a list of Q ID strings, one per distinct matching item (empty if none).
    Sends one POST request to the SPARQL endpoint, then pauses for sparql_sleep seconds.
    """
    # Escape characters that would otherwise end or corrupt the double-quoted
    # SPARQL literal (for example a label containing a quotation mark).
    # The backslash must be handled first so later escapes are not doubled.
    literal = string.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '\\r')

    # create a string for the query
    query = 'select distinct ?id '
    query += 'where {\n'
    query += '  {?id rdfs:label "' + literal + '"@en.}\n'
    query += '  union\n'
    query += '  {?id skos:altLabel "' + literal + '"@en.}\n'
    query += '  '
    # Classes are matched through p:P31/ps:P31 statement nodes rather than wdt:P31.
    # One group per class; the groups are alternatives, so they are joined with "union".
    class_patterns = [
        '{?id p:P31 ?statement.\n  ?statement ps:P31 wd:' + class_qid + '.}\n  '
        for class_qid in class_list
    ]
    query += 'union\n  '.join(class_patterns)
    query += '}'
    #print(query)

    r = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    data = r.json()
    results = data['results']['bindings']
    return_value = [extract_qnumber(result['id']['value']) for result in results]

    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def retrieve_gallery_classes():
    """Retrieve the classes (P31 values) of items in one Wikidata collection.

    The collection is fixed to wd:Q160236 (Metropolitan Museum of Art), because
    there are too many collections to query without specifying one.

    Returns a list of {'label': ..., 'qid': ...} dictionaries, one per result row,
    in the order returned by the query (which sorts by English label).
    Sends one POST request to the SPARQL endpoint, then pauses for sparql_sleep seconds.
    """
    # create a string for the query, one line of SPARQL per list entry
    query_lines = [
        'select distinct ?class ?label where ',
        '      {',
        '      ?item wdt:P195 wd:Q160236.',
        '      ?item wdt:P31 ?class.',
        '      ?class rdfs:label ?label.',
        "      filter(lang(?label) = 'en')",
        '      }',
        '      order by ?label',
    ]
    query = '\n'.join(query_lines)

    #print(query)

    print('sending query')
    response = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    print('results returned')
    bindings = response.json()['results']['bindings']

    class_records = []
    for binding in bindings:
        class_qid = extract_qnumber(binding['class']['value'])
        class_label = binding['label']['value']
        class_records.append({'label': class_label, 'qid': class_qid})

    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return class_records


# Extract data

This section simplifies the column headers and writes a copy of the data to a CSV file.

In [None]:
# Load the source table and show its original column headers
filename = 'gallery_works.csv'
old_works = read_dict(filename)
fieldnames = list(old_works[0])
print(fieldnames)


In [None]:
# Simplified column name -> original column header in the source table.
# The order here is the column order of the output file.
# License and the Artstor fields are not populated, so they are left out.
column_map = {
    'ssid': 'SSID',
    'filename': 'Filename',
    'title': 'Title[637073]',
    'creator': 'Creator[637071]',
    'date': 'Date[637076]',
    'classification': 'Classification[637103]',
    'medium': 'Medium[637080]',
    'measurements': 'Measurements[637081]',
    'style_period': 'Style/Period[637079]',
    'country_culture': 'Country/Culture[637072]',
    'seals_inscriptions': 'Seals & Inscriptions[637104]',
    'signature': 'Signature[637105]',
    'description': 'Description[637092]',
    'publications': 'Publications[637106]',
    'exhibitions': 'Exhibitions[637107]',
    'accession_number': 'Accession Number[637085]',
    'date_acquired': 'Date Acquired[637109]',
    'gift_of': 'Gift of[637110]',
    'purchased_from': 'Purchased from[637111]',
    'credit_line': 'Credit Line[637112]',
    'provenance': 'Provenance[637113]',
    'collection': 'Collection[637114]',
    'last_change': 'Last Change[637115]',
    'notes': 'Notes[637116]',
    'rights': 'Rights[637099]',
    'media_url': 'Media URL',
}

# Copy every row, keeping only the mapped columns under their simplified names
works = []
for old_work in old_works:
    work = {}
    for new_name, old_name in column_map.items():
        work[new_name] = old_work[old_name]
    works.append(work)

fieldnames = list(works[0])
write_dicts_to_csv(works, 'gallery_works_renamed.csv', fieldnames)

print('done')

# Generate country table

Match labels in the country_culture column with labels and aliases in Wikidata for various country-like items

In [None]:
# Distinct country/culture strings in sorted order; the empty string sorts first, so drop it from the front
values = sorted(non_redundant(works, 'country_culture'), key=sort_funct)
if values[0] == '':
    del values[0]
print(values)

# Classes an item must be an instance of to count as a match
country_classes = ['Q6256','Q3624078','Q6266']

mappings = []
for value in values:
    result_list = searchLabelsAtWikidata(value, country_classes)
    print('|' + value + '|', result_list)
    match_count = len(result_list)
    if match_count == 0:
        # no match: leave the Q ID blank
        qid = ''
    elif match_count == 1:
        # unambiguous match: record the single Q ID
        qid = result_list[0]
    else:
        # several candidates: record the whole list
        qid = result_list
    mappings.append({'string': value, 'qid': qid})
write_dicts_to_csv(mappings, 'country_mappings.csv', ['string', 'qid'])
print('done')


# Generate classes

This section first gets all of the classes (values for P31 instanceOf) from the Met's collection. Then it tries to match the labels of those items to the values in the classification column of the gallery data.

In [None]:
# get a list of all classes in the Met
# (each entry is a dictionary with 'label' and 'qid' keys; this sends a query to the SPARQL endpoint)
work_classes = retrieve_gallery_classes()
print(work_classes)

In [None]:
# Distinct classification strings in sorted order; the empty string sorts first, so drop it from the front
values = sorted(non_redundant(works, 'classification'), key=sort_funct)
if values[0] == '':
    del values[0]
print(values)

mappings = []
for value in values:
    lowered_value = value.lower()
    # every class whose label equals the string (ignoring case) yields its own row
    matches = [
        {'string': value, 'qid': work_class['qid'], 'label': work_class['label']}
        for work_class in work_classes
        if lowered_value == work_class['label'].lower()
    ]
    if not matches:
        # keep unmatched strings in the output with blank Q ID and label
        matches = [{'string': value, 'qid': '', 'label': ''}]
    mappings.extend(matches)
write_dicts_to_csv(mappings, 'classification_mappings.csv', ['string', 'qid', 'label'])
print()
print('done')


# Parse dimensions data

This processes the values of the measurements column and separates them into height and width, or height/width/depth, quantities. A value with a single number is stored in the circum column instead.

In [None]:
dimensions = {}


def _parse_measurements(measurement):
    """Split a measurements string into (height, width, depth, circum) strings.

    measurement: the measurements value with surrounding whitespace already removed.

    Returns a 4-tuple of strings; parts that do not apply are empty strings.
    An empty input gives four empty strings. Returns None when the value is
    not in one of the recognized formats, so the caller can report it.
    """
    if measurement == '':
        # no value; all parts are empty
        return ('', '', '', '')
    try:
        if ' x ' not in measurement:
            # one dimensional: expected form is a number followed by "in."
            pieces = measurement.split(' ')
            value = float(pieces[0])
            if pieces[1] != 'in.':
                return None
            return ('', '', '', str(value))

        # the string has an x in it, so it's multidimensional
        pieces = [piece.strip() for piece in measurement.split('x')]
        # remove the "in." and any spaces from the last piece
        pieces[-1] = pieces[-1].split('in')[0].strip()
        height = str(float(pieces[0]))
        width = str(float(pieces[1]))
        # two pieces: two-dimensional work; otherwise the third piece is the depth
        # NOTE: any pieces beyond the third are ignored
        depth = '' if len(pieces) == 2 else str(float(pieces[2]))
        return (height, width, depth, '')
    except (ValueError, IndexError):
        # ValueError: a piece is not a number; IndexError: no unit after a single number
        return None


for work in works:
    string = work['measurements'].strip()
    parsed = _parse_measurements(string)
    if parsed is None:
        # improperly formatted: report it and leave all dimension columns empty
        print(work['accession_number'], string)
        parsed = ('', '', '', '')
    work['height'], work['width'], work['depth'], work['circum'] = parsed

for work in works:
    print(work['height'], work['width'], work['depth'], work['circum'])

In [None]:
# Save the works table using the columns of the first row, in that row's key order
fieldnames = list(works[0])
write_dicts_to_csv(works, 'gallery_works_with_dim.csv', fieldnames)

print('done')

The following cell is obsolete but is being left for reference purposes. Since each line represents a separate item, it is no longer necessary to try to group lines.

In [None]:
# OBSOLETE: merges consecutive rows that share a title and creator into one record,
# collecting the per-row fields (SSID, filename, etc.) into lists.
# NOTE(review): this loop iterates over `works` but looks values up by the original
# bracketed column headers (e.g. 'Title[637073]'). The cell that builds `works` stores
# only the simplified keys, so these lookups would presumably raise KeyError if this
# cell were run after it -- `old_works` was likely intended; confirm before reviving.
non_redundant_works = []
last_title = ''
last_creator = ''
last_date = ''  # never read in this cell
n_non_redundant_works = 0
for redundant_work in works:
    if last_title == redundant_work['Title[637073]'] and last_creator == redundant_work['Creator[637071]']:
        # the same work as the previous row
        
        # add non-redundant fields to the existing lists
        # (these are the same list objects already stored in non_redundant_work)
        ssid.append(redundant_work['SSID'])
        filename.append(redundant_work['Filename'])
        media_url.append(redundant_work['Media URL'])
        accession_number.append(redundant_work['Accession Number[637085]'])
        last_change.append(redundant_work['Last Change[637115]'])
        date_acquired.append(redundant_work['Date Acquired[637109]'])

        # replace the list values in every loop
        non_redundant_work['ssid'] = ssid
        non_redundant_work['filename'] = filename
        non_redundant_work['accession_number'] = accession_number
        non_redundant_work['date_acquired'] = date_acquired
        non_redundant_work['last_change'] = last_change
        non_redundant_work['media_url'] = media_url
        
        # replace the last value of the non-redundant work with the new value containing added list items
        non_redundant_works[n_non_redundant_works -1] = non_redundant_work

    else:
        # a new work
        non_redundant_work = {}
        n_non_redundant_works += 1
        
        # remember the title and creator so the next row can be compared against them
        last_title = redundant_work['Title[637073]'] 
        last_creator = redundant_work['Creator[637071]']
        
        # add non-redundant fields to new lists
        ssid = [redundant_work['SSID']]
        filename = [redundant_work['Filename']]
        accession_number = [redundant_work['Accession Number[637085]']]
        date_acquired = [redundant_work['Date Acquired[637109]']]
        last_change = [redundant_work['Last Change[637115]']]
        media_url = [redundant_work['Media URL']]
        
        # add redundant fields to work record
        non_redundant_work['title'] = redundant_work['Title[637073]'] 
        non_redundant_work['creator'] = redundant_work['Creator[637071]']
        non_redundant_work['date'] = redundant_work['Date[637076]']
        non_redundant_work['classification'] = redundant_work['Classification[637103]']
        non_redundant_work['medium'] = redundant_work['Medium[637080]']
        non_redundant_work['measurements'] = redundant_work['Measurements[637081]']
        non_redundant_work['style_period'] = redundant_work['Style/Period[637079]']
        non_redundant_work['country_culture'] = redundant_work['Country/Culture[637072]']
        non_redundant_work['seals_inscriptions'] = redundant_work['Seals & Inscriptions[637104]']
        non_redundant_work['signature'] = redundant_work['Signature[637105]']
        non_redundant_work['description'] = redundant_work['Description[637092]']
        non_redundant_work['publications'] = redundant_work['Publications[637106]']
        non_redundant_work['exhibitions'] = redundant_work['Exhibitions[637107]']
        non_redundant_work['gift_of'] = redundant_work['Gift of[637110]']
        non_redundant_work['purchased_from'] = redundant_work['Purchased from[637111]']
        non_redundant_work['credit_line'] = redundant_work['Credit Line[637112]']
        non_redundant_work['provenance'] = redundant_work['Provenance[637113]']
        non_redundant_work['collection'] = redundant_work['Collection[637114]']
        non_redundant_work['notes'] = redundant_work['Notes[637116]']
        non_redundant_work['rights'] = redundant_work['Rights[637099]']
        # License and the Artstor fields are not populated
        
        # replace the list values in every loop
        non_redundant_work['ssid'] = ssid
        non_redundant_work['filename'] = filename
        non_redundant_work['accession_number'] = accession_number
        non_redundant_work['date_acquired'] = date_acquired
        non_redundant_work['last_change'] = last_change
        non_redundant_work['media_url'] = media_url
        
        # since this is the first time for this non-redundant work, add it to the list
        non_redundant_works.append(non_redundant_work)
        
# write the grouped records using the columns of the first record
fieldnames = list(non_redundant_works[0].keys())
write_dicts_to_csv(non_redundant_works, 'gallery_works_nr.csv', fieldnames)

print('done')