In [None]:
# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

from pathlib import Path
import requests
from time import sleep
import json
import urllib
import csv
import os
from fuzzywuzzy import fuzz # fuzzy logic matching
from copy import deepcopy
from langdetect import detect
from langdetect import detect_langs
import datetime
import pandas as pd
from fuzzywuzzy import fuzz # fuzzy logic matching

# ----------------
# Configuration settings
# ----------------

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
get_server_sleep = 0.1 # number of seconds to wait before get calls to webserver
dots_sleep = 1 # number of seconds to wait between calls to ParallelDots API
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
endpoint = 'https://query.wikidata.org/sparql' # SPARQL endpoint that queries are POSTed to
accept_media_type = 'application/json' # media type requested in the Accept header

# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook has to edit it. The file names appended to it assume the trailing slash.
data_directory = '/Users/baskausj/vanderbilt/Digital Scholarship and Communications - Documents/data-curation/projects/wikidata/act/wikidata_data/'


# Calculate the reference date retrieved value for all statements
# datetime.datetime.utcnow() is deprecated as of Python 3.12, so take an aware UTC
# timestamp and drop its tzinfo; that keeps the string free of a '+00:00' suffix.
whole_time_string_z = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() # form: 2019-12-05T15:35:04.959311
dateZ = whole_time_string_z.split('T')[0] # form 2019-12-05
ref_retrieved = dateZ + 'T00:00:00Z' # form 2019-12-05T00:00:00Z as provided by Wikidata, without leading +

def extract_qnumber(iri):
    """Return the last '/'-separated segment of an IRI.

    For http://www.wikidata.org/entity/Q6386232 the result is 'Q6386232'.
    A string that contains no '/' is returned unchanged.
    """
    _, _, last_segment = iri.rpartition('/')
    return last_segment

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first line of the file supplies the keys; each later line becomes one
    dictionary mapping those keys to the cell strings.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a header row followed by one CSV row per dictionary in table.

    The file is opened for writing as UTF-8 (replacing any existing content);
    fieldnames fixes both the header and the column order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

def generate_header_dictionary(accept_media_type):
    """Build the HTTP header dictionary sent with a SPARQL query request.

    accept_media_type is placed in the Accept header; the Content-Type and the
    User-Agent identifying this script are fixed.
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'DisambituateACT/0.1 (mailto:steve.baskauf@vanderbilt.edu)',
    }

# Request headers built once at module level from the configured Accept media type.
# NOTE(review): nothing in the visible code reads this variable; retrieve_label()
# builds its own header dictionary on every call.
requestheader = generate_header_dictionary(accept_media_type)

def retrieve_label(qid):
    """Query the SPARQL endpoint for the English label of an item's creator (P170).

    Parameters
    ----------
    qid : str
        Q identifier of the item, e.g. 'Q6386232'. It is concatenated directly
        into the query text, so it must be a bare identifier.

    Returns
    -------
    str
        The English label of the creator (the last one returned when there are
        several), '' when the query returned no label, or the sentinel 'fail'
        when the request failed or its response could not be interpreted.
    """
    query = '''select distinct ?label where 
      {
      wd:''' + qid + '''  wdt:P170 ?artist.
      ?artist rdfs:label ?label.
      filter(lang(?label) = 'en')
      }'''

    #print(query)

    response = None # stays None when the POST itself raises
    #print('sending query')
    try:
        response = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
        #print('results returned')
        results = response.json()['results']['bindings']
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Network errors, a non-JSON body, or JSON without the expected structure.
        if response is not None:
            print(response.text)
        print('Failed query; waiting 10 seconds to try again.')
        print()
        sleep(10)
        # Return the sentinel immediately so that the caller can retry; falling
        # through would use undefined results and lose the failure signal.
        return 'fail'

    label = ''
    for result in results:
        try:
            label = result['label']['value']
        except (KeyError, TypeError):
            # A malformed binding ends the scan; keep whatever label was found so far.
            break

    # delay (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return label

def check_for_match(test_ratio, painting, act_id, act_title, act_artist):
    """Test one Wikidata candidate against one ACT work by title and then by artist.

    Parameters
    ----------
    test_ratio : int
        Minimum fuzzy score that both the title and the artist comparison must reach.
    painting : mapping
        Candidate row; its 'label' and 'item' values are read.
    act_id, act_title, act_artist
        Identifier, title and artist string of the ACT work being matched.

    Returns
    -------
    dict
        Empty when the candidate does not match. Otherwise it holds the keys
        act_id, act_title, title_match, qid, wikidata_label, act_artist,
        artist_match and wikidata_artist.
    """
    title_ratio = fuzz.ratio(act_title, painting['label'])
    if title_ratio < test_ratio:
        return {}

    # Only candidates whose title is close enough are worth a query for the artist.
    qid = extract_qnumber(painting['item'])
    for _ in range(10):
        artist_label = retrieve_label(qid)
        if artist_label != 'fail':
            break
    else:
        # Every attempt returned the failure sentinel; do not score the literal
        # string 'fail' as though it were an artist name.
        print('Could not retrieve the artist label for', qid)
        return {}

    # The set ratio tolerates reordered name parts and extra tokens such as dates.
    artist_ratio = fuzz.token_set_ratio(act_artist, artist_label)
    if artist_ratio < test_ratio:
        return {}

    print(title_ratio, painting['label'], ' / ', artist_ratio, qid, artist_label)
    return {
        'act_id': act_id,
        'act_title': act_title,
        'title_match': title_ratio,
        'qid': qid,
        'wikidata_label': painting['label'],
        'act_artist': act_artist,
        'artist_match': artist_ratio,
        'wikidata_artist': artist_label,
    }


In [None]:
# Load data
# Every column is read as a string, and NaN detection is turned off.
paintings_in_wikidata = pd.read_csv(
    f'{data_directory}wikidata_paintings.csv',
    dtype=str,
    na_filter=False,
)
paintings_in_wikidata.head()


In [None]:
# Same loading options as for the paintings table: all strings, no NaN detection.
nonpaintings_in_wikidata = pd.read_csv(
    f'{data_directory}wikidata_other_artwork_types.csv',
    dtype=str,
    na_filter=False,
)
nonpaintings_in_wikidata.head()


In [None]:
# Read the works table from the working directory and index it by the act_id column.
works_to_add = pd.read_csv(
    'add_to_wikidata.csv',
    dtype=str,
    na_filter=False,
).set_index('act_id')
#works_to_add = works_to_add.head(100) # use just the first 100 for testing
works_to_add

In [None]:
# Read the ACT records and index them by the RecordNumber column.
act_data = pd.read_csv(
    '../processed_lists/act_all_202109241353_repaired.csv',
    dtype=str,
    na_filter=False,
).set_index('RecordNumber')
act_data.head()


In [None]:
# test of fuzzy matching
# Compare the four fuzzywuzzy scorers on a name written two different ways.
first = 'Lucas, van Leyden, 1494-1533'
second = 'Lucas van Leyden'
ratio, partial_ratio, sort_ratio, set_ratio = (
    scorer(first, second)
    for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)
)
for score_name, score in (
    ('ratio', ratio),
    ('partial_ratio', partial_ratio),
    ('sort_ratio', sort_ratio),
    ('set_ratio', set_ratio),
):
    print(score_name, score)


In [None]:
# Use set ratio for artist name since it may be reversed, have dates, etc.
test_ratio = 85

output_list = []
fieldnames = ['act_id', 'act_title', 'title_match', 'qid', 'wikidata_label', 'act_artist', 'artist_match', 'wikidata_artist']
for act_id, work in works_to_add.iterrows():
    print(act_id)
    # look up information from the ACT data file
    try: # error trap in case there is a failure to match
        # Build the row selection once instead of once per column.
        act_record = act_data.loc[act_data.index == act_id]
        act_title = act_record['Title'].values[0]
        act_artist = act_record['CreatorArtist'].values[0]
        act_type = act_record['ObjectFunction'].values[0]
    except (IndexError, KeyError):
        # IndexError: no ACT record has this id; KeyError: an expected column is missing.
        # Catching only these lets an interrupt or a real bug surface.
        continue # skip this work and go to the next one
    print(act_title, '/', act_type, '/', act_artist)

    # Choose which Wikidata tables to scan: paintings only for works typed
    # 'Painting', both tables when the type is blank, otherwise non-paintings only.
    if act_type == 'Painting':
        candidate_tables = [paintings_in_wikidata]
    elif act_type == '':
        candidate_tables = [paintings_in_wikidata, nonpaintings_in_wikidata]
    else:
        candidate_tables = [nonpaintings_in_wikidata]

    for candidate_table in candidate_tables:
        for index, candidate in candidate_table.iterrows():
            result_dict = check_for_match(test_ratio, candidate, act_id, act_title, act_artist)
            if result_dict:
                output_list.append(result_dict)
                # Rewrite the file after every match so that results found so far
                # are on disk if the run is interrupted.
                write_dicts_to_csv(output_list, 'artwork_matches.csv', fieldnames)

    print()

print('done')