This notebook is the last step before completing Phase 2 of the ACT project by uploading the data with VanderBot.

In [None]:
# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

import pandas as pd
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)
import requests
import json
import csv
import re # regex
from time import sleep
from fuzzywuzzy import fuzz # fuzzy string matching

import urllib.parse
import urllib.request as urlrequest
import cv2 
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline



# --- Settings for talking to the Wikidata Query Service and the Commons web server ---
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
get_server_sleep = 0.1 # number of seconds to wait before get calls to webserver
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# --- Input tables ---
# All columns are read as strings and empty cells stay as empty strings (na_filter=False),
# so later comparisons such as `== ''` work without NaN handling.
# The "act" column of each problem table is turned into a list used for membership tests in later cells.
date_problems_frame = pd.read_csv('issues_with_inception_dates.csv', na_filter=False, dtype = str)
date_problems_list = list(date_problems_frame['act'])
creator_mismatches_frame = pd.read_csv('creator_name_mismatches.csv', na_filter=False, dtype = str)
creator_mismatches_list = list(creator_mismatches_frame['act'])
# The table of works that the screening cells below partition
works_frame = pd.read_csv('abstract_artworks_charlotte_edits.csv', na_filter=False, dtype = str)
# Uncomment to test with only the first few rows
#works_frame = works_frame.head(7).copy()
works_frame.head()

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers used for SPARQL queries.

    accept_media_type: value to send in the Accept header (e.g. 'application/json').

    Returns a dictionary with the Accept, Content-Type, and User-Agent headers.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    headers['Content-Type'] = 'application/sparql-query'
    # Identify the client, since some Wikimedia servers don't like unidentified clients
    headers['User-Agent'] = 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    return headers

# Module-level headers reused by send_sparql_query for every request to the endpoint
requestheader = generate_header_dictionary(accept_media_type)

# Read from a CSV file on disk into a list of dictionaries (representing a table)
def read_dicts_from_csv(filename):
    """Load a UTF-8 CSV file from disk as a list of dictionaries, one per data row.

    The keys of each dictionary are taken from the file's header row.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of dictionaries to a UTF-8 CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the file to create or overwrite.
    fieldnames: column names, written as the header row and used to order the values.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)
            

def convertToRGB(image):
    """Convert an image array from BGR to RGB channel order using cv2.cvtColor."""
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return rgb_image

# URL prefixes for Commons media files (Special:FilePath form) and for Commons file description pages
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'
def commons_url_to_filename(url):
    """Turn a Commons Special:FilePath URL into the plain (unencoded) file name.

    Example input:
    http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg

    Raises IndexError if the URL does not contain commons_prefix.
    """
    # Everything after the prefix is the URL-encoded local name of the file
    encoded_local_name = url.split(commons_prefix)[1]
    return urllib.parse.unquote(encoded_local_name)

def filename_to_commons_url(filename):
    """Build the Commons Special:FilePath URL for a plain file name by URL-encoding the name."""
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Turn a Commons file page URL into the plain file name (spaces instead of underscores).

    Example input:
    https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg

    Raises IndexError if the URL does not contain commons_page_prefix.
    """
    local_name = url.split(commons_page_prefix)[1]
    # Underscores become spaces before the percent-encoding is reversed
    return urllib.parse.unquote(local_name.replace('_', ' '))

def filename_to_commons_page_url(filename):
    """Build the Commons file page URL for a plain file name.

    Spaces become underscores and the name is URL-encoded, except that parentheses
    and commas are left as literal characters.
    """
    url = commons_page_prefix + urllib.parse.quote(filename.replace(' ', '_'))
    # Restore the characters that are kept unencoded in the page URL
    for encoded, literal in (('%28', '('), ('%29', ')'), ('%2C', ',')):
        url = url.replace(encoded, literal)
    return url


def check_commons_page_for_wikidata_image_link(image_filename):
    """Look up the Wikidata Q ID linked from a Commons file page via the small Wikidata logo.

    The file page is retrieved and searched for an <img> tag whose src is the 20px
    Wikidata logo. The href of the parent tag of the first such image is taken as the
    link to the Wikidata item.

    image_filename: plain Commons file name, e.g.
        'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'

    Returns the local name (last path segment) of the link, or '' when no logo is found
    or its parent tag has no href.
    """
    wikidata_logo_src = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Wikidata-logo.svg/20px-Wikidata-logo.svg.png'

    # Build the URL with the shared helper so that characters such as "?", "#" and "%" in the
    # file name are percent-encoded rather than being interpreted as URL syntax.
    page_url = filename_to_commons_page_url(image_filename)
    # Identify the client, as is done for the SPARQL requests
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    soup = BeautifulSoup(response.text, 'html.parser')
    logo_tags = soup.find_all('img', src=wikidata_logo_src)

    qid = ''
    if logo_tags:
        # The logo is expected to sit inside the anchor that links to the item;
        # .get() avoids a KeyError when the parent tag carries no href.
        link = logo_tags[0].parent.get('href', '')
        if link:
            qid = extract_localname(link)

    sleep(get_server_sleep) # Don't hit the server too fast
    return qid

def _find_artwork_qid(soup):
    """Return the Q ID linked from the artwork section of a parsed Commons file page, or ''."""
    tables = soup.find_all('table', class_=re.compile('fileinfotpl'))
    if not tables:
        return ''
    # Have to check for this span because there are other subtables with a tags besides the one at the top
    spans = tables[0].find_all('span', id='artwork')
    if not spans:
        return ''
    # The link to the Wikidata item will be in an href
    # Need to go up to parent, since the anchor is sometimes a sibling tag and not a child tag
    anchors = spans[0].parent.find_all('a', href=re.compile('https://www.wikidata.org/wiki/'))
    if not anchors:
        return ''
    # Every anchor found here matched on its href, so the attribute is always present
    return extract_localname(anchors[0]['href'])


def check_commons_page_for_wikidata_link(image_filename):
    """Look up the Wikidata Q ID linked from the file information table of a Commons file page.

    The file page is retrieved and the first table whose class matches 'fileinfotpl' is
    searched for a span with id 'artwork'; the first Wikidata link found under that span's
    parent supplies the Q ID.

    image_filename: plain Commons file name, e.g.
        'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'

    Returns the local name (last path segment) of the link, or '' when any of the
    expected elements is missing.
    """
    # Build the URL with the shared helper so that characters such as "?", "#" and "%" in the
    # file name are percent-encoded rather than being interpreted as URL syntax.
    page_url = filename_to_commons_page_url(image_filename)
    # Identify the client, as is done for the SPARQL requests
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    soup = BeautifulSoup(response.text, 'html.parser')
    qid = _find_artwork_qid(soup)

    sleep(get_server_sleep) # Don't hit the server too fast
    return qid

# function to get local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI after its last slash.

    For example, http://www.wikidata.org/entity/Q6386232 gives 'Q6386232'.
    A string without any slash is returned unchanged.
    """
    return iri.rsplit('/', 1)[-1]

# Sends a query to the query service endpoint.
# NOTE: requestheader, endpoint, and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST a SPARQL query to the endpoint and return the list of result bindings.

    query_string: complete SPARQL query text.

    If the request itself fails (connection error, timeout, etc.), a message is printed
    and the request is retried after one minute, indefinitely. A response that arrives
    but does not contain JSON is not retried; response.json() raises in that case.

    Returns the value of ['results']['bindings'] from the JSON response.
    """
    while True:
        try:
            response = requests.post(endpoint, data=query_string.encode('utf-8'), headers=requestheader)
            break
        # Catch only request failures. A bare except here would also swallow
        # KeyboardInterrupt, so the retry loop could not be interrupted mid-request.
        except requests.exceptions.RequestException:
            print('Query service error. Waiting 1 minute.')
            sleep(60)

    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
    data = response.json()

    # Extract the values from the response JSON
    results = data['results']['bindings']
    # print(json.dumps(results, indent=2))

    sleep(sparql_sleep) # delay to avoid hitting the Query Service too fast
    return results

def get_wikidata_item_label(qid):
    """Return the English label of a Wikidata item, or '' if the query returns no label.

    qid: Q ID of the item (e.g. 'Q42'); it is inserted after the wd: prefix in the query.
    """
    query_string = '\n'.join([
        '',
        '    select distinct ?label',
        '    where {',
        '    wd:' + qid + ' rdfs:label ?label.',
        '    filter(lang(?label)="en")',
        '    }',
    ])
    #print(query_string)

    bindings = send_sparql_query(query_string)
    #print(json.dumps(bindings, indent=2))

    # Only the first binding is used
    return bindings[0]['label']['value'] if len(bindings) > 0 else ''

def find_works_by_artists(qid_text_list):
    """Retrieve the works with English labels whose creator (P170) is one of the given artists.

    qid_text_list: text placed inside the braces of the SPARQL VALUES clause, i.e.
        whitespace-separated terms such as 'wd:Q5582\nwd:Q5598'.

    Returns a list of dictionaries with the keys 'artist' (artist Q ID), 'work'
    (work Q ID), and 'label' (English label of the work). The list is empty when
    the query returns nothing.
    """
    query_string = '\n'.join([
        '',
        '    select distinct ?artist ?work ?workLabel',
        '    where {',
        '    values ?artist {' + qid_text_list + '}',
        '    ?work wdt:P170 ?artist.',
        '    ?work rdfs:label ?workLabel.',
        '    filter(lang(?workLabel)="en")',
        '    }',
    ])
    #print(query_string)

    bindings = send_sparql_query(query_string)
    #print(json.dumps(bindings, indent=2))

    # Reduce each binding to the local names of the IRIs plus the label text
    return [
        {
            'artist': extract_localname(binding['artist']['value']),
            'work': extract_localname(binding['work']['value']),
            'label': binding['workLabel']['value'],
        }
        for binding in bindings
    ]


## Pull out rows with Commons page having link to Wikidata

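There are a number of rows, mostly for details of works, where the link to Wikidata wasn't found previously. This cell checks each Commons page again for a link to a Wikidata item. The new function that looks for the tiny Wikidata logo image to get the link is `check_commons_page_for_wikidata_image_link`; note that the code below currently calls `check_commons_page_for_wikidata_link`, which reads the file information table instead.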

In [None]:
# Partition works_frame into rows whose Commons page links to a Wikidata item
# (saved to wikidata_link_found.csv together with the item's Q ID and English label)
# and the remaining rows, which are passed on to the next screening cell.
wikidata_link_found_output_list = []
work_qid_list = []
work_label_list = []
remaing_works_list = []
for work_index, work_row in works_frame.iterrows():
    print(work_row['act'])

    # NOTE(review): this uses the table-based lookup; the logo-based alternative is
    # check_commons_page_for_wikidata_image_link -- confirm which one is intended here.
    qid = check_commons_page_for_wikidata_link(work_row['image'])
    #print(qid)

    if qid == '':
        # No link found: keep the row for the later screens
        remaing_works_list.append(work_row)
        continue

    wikidata_link_found_output_list.append(work_row)
    work_qid_list.append(qid)
    label = get_wikidata_item_label(qid)
    work_label_list.append(label)
    #print(label)

remaing_works_frame = pd.DataFrame(remaing_works_list)
wikidata_link_found_output_frame = pd.DataFrame(wikidata_link_found_output_list)

# The Q ID and label lists were built in the same order as the found rows,
# so they can be attached as new columns before saving.
wikidata_link_found_output_frame['work_qid'] = work_qid_list
wikidata_link_found_output_frame['work_label'] = work_label_list

wikidata_link_found_output_frame.to_csv('wikidata_link_found.csv')
wikidata_link_found_output_frame.head()

print('done')


## Pull out rows with date problems

In [None]:
# Split the remaining works into those whose "act" ID is on the date problem list
# (saved to date_problems.csv) and those that go on to the next screening cell.
date_problems_output_list = []
remaing_works_list = []

# A set gives constant-time membership tests. Each work is appended exactly once;
# re-scanning the problem table for every hit only repeated the same row when an
# "act" value was listed more than once.
date_problem_acts = set(date_problems_list)

for work_index, work_row in remaing_works_frame.iterrows():
    #print(work_row['act'])

    if work_row['act'] in date_problem_acts:
        date_problems_output_list.append(work_row)
    else:
        remaing_works_list.append(work_row)

remaing_works_frame = pd.DataFrame(remaing_works_list)

date_problems_output_frame = pd.DataFrame(date_problems_output_list)
date_problems_output_frame.to_csv('date_problems.csv')
date_problems_output_frame.head()


## Pull out rows with missing creator values


In [None]:
# Split the remaining works into those with an empty "creator" value
# (saved to missing_creators.csv) and those that go on to the next screening cell.
missing_creators_output_list = []
remaing_works_list = []
for work_index, work_row in remaing_works_frame.iterrows():
    #print(work_row['act'])

    has_creator = work_row['creator'] != ''
    target_list = remaing_works_list if has_creator else missing_creators_output_list
    target_list.append(work_row)

remaing_works_frame = pd.DataFrame(remaing_works_list)

missing_creators_output_frame = pd.DataFrame(missing_creators_output_list)
missing_creators_output_frame.to_csv('missing_creators.csv')
missing_creators_output_frame.head()

## Pull out rows with creator name mismatches 

In [None]:
# Split the remaining works into those whose "act" ID is on the creator name mismatch list
# (saved to creator_mismatch_problems.csv) and those that passed every screen.
creator_mismatch_output_list = []
remaing_works_list = []

# A set gives constant-time membership tests. Each work is appended exactly once;
# re-scanning the mismatch table for every hit only repeated the same row when an
# "act" value was listed more than once.
creator_mismatch_acts = set(creator_mismatches_list)

for work_index, work_row in remaing_works_frame.iterrows():
    #print(work_row['act'])

    if work_row['act'] in creator_mismatch_acts:
        creator_mismatch_output_list.append(work_row)
    else:
        remaing_works_list.append(work_row)

remaing_works_frame = pd.DataFrame(remaing_works_list)

creator_mismatch_output_frame = pd.DataFrame(creator_mismatch_output_list)
creator_mismatch_output_frame.to_csv('creator_mismatch_problems.csv')
creator_mismatch_output_frame.head()


## Write the works that remain after screening

In [None]:
# Save the rows that were not pulled out by any of the screening cells above.
# The fuzzy-matching cells further down read this file back in.
# Because index=False is not passed, the DataFrame index is written as an unnamed first column.
remaing_works_frame.to_csv('works_to_write.csv')
remaing_works_frame.head()


## Clean labels

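Remove problematic characters and extra whitespace from the labels and descriptions, copy each cleaned label to the title column, and truncate labels that are longer than 250 characters.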

In [None]:
problematic_characters = '\\'

def _clean_text(text):
    """Tidy a label or description string.

    Tabs are changed to spaces, every character in problematic_characters is removed,
    runs of spaces are collapsed to a single space, and leading/trailing whitespace is stripped.
    """
    # Change tabs to spaces
    text = text.replace('\t', ' ')

    # Remove problematic characters
    for problematic_character in problematic_characters:
        text = text.replace(problematic_character, '')

    # Keep removing double spaces until there aren't any more (works for 3x or more spaces)
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text.strip()

works_frame = pd.read_csv('abstract_artworks.csv', na_filter=False, dtype = str)
#works_frame = pd.read_csv('works_to_write.csv', na_filter=False, dtype = str)
#works_frame = works_frame.head(9).copy()

# Progress listing of the works being processed
for act in works_frame['act']:
    print(act)

# The changes are written to whole columns of the DataFrame. Assigning to the row objects
# produced by iterrows() is not reliable: pandas may hand out copies, in which case the
# edits (and any values for columns that don't exist yet) never reach works_frame.
cleaned_labels = works_frame['label_en'].map(_clean_text)
works_frame['description_en'] = works_frame['description_en'].map(_clean_text)

# Copy the cleaned label to the title column (before quotes are replaced and the length is limited)
# NOTE: a few titles aren't in English, but they can be removed manually
# The image reference values are copied over since every item has them
works_frame['title'] = cleaned_labels
works_frame['title_ref1_referenceUrl'] = works_frame['image_ref1_referenceUrl']
works_frame['title_ref1_retrieved_val'] = works_frame['image_ref1_retrieved_val']

# Replace double quotes in labels with single quotes since VanderBot has problems with them,
# then limit the length of the label to 250 characters (the maximum)
works_frame['label_en'] = cleaned_labels.map(lambda label: label.replace('"', "'")[:250])

works_frame.to_csv('works_to_write_out.csv', index = False)
works_frame.head(9)

## Fuzzy match the titles against works in Wikidata

There are too many artworks (especially paintings) by famous artists that are already in Wikidata, but not yet matched.

This first cell retrieves from the Wikidata Query Service all works by all of the artists in ACT whose works will potentially be written. (There are about 35 000.)

In [None]:
# Query Wikidata for every labeled work by each creator in the screened works table
# and save the results to test_titles.csv for the fuzzy matching cell.
works_frame = pd.read_csv('works_to_write.csv', na_filter=False, dtype = str)

# One entry per distinct creator value.
# NOTE: "anon" values are not removed here; the matching cell skips those rows itself.
artists_list = list(set(works_frame['creator']))

# prepend "wd:" and concatenate with each Q ID on its own line
artists_string = 'wd:' + '\nwd:'.join(artists_list)

works = find_works_by_artists(artists_string)
write_dicts_to_csv(works, 'test_titles.csv', ['artist', 'work', 'label'])


Now perform fuzzy matching of our title strings against the labels for that artist in Wikidata.

In [None]:
# Compare the label of every work to be written against the labels of the works that
# Wikidata already has for the same artist. Pairs with a similarity ratio above 60 are
# printed and saved to possible_matches.csv.
works_frame = pd.read_csv('works_to_write.csv', na_filter=False, dtype = str)
#works_frame = works_frame.head(5).copy()
works = read_dicts_from_csv('test_titles.csv')

# Index the Wikidata works by artist once, so each row is compared only against that
# artist's works instead of scanning the whole list. Order within an artist is kept.
works_by_artist = {}
for work in works:
    works_by_artist.setdefault(work['artist'], []).append(work)

possible_match_list = []
for work_index, work_row in works_frame.iterrows():
    #print(work_row['label_en'])
    artist_qid = work_row['creator']
    if artist_qid == 'anon':
        continue

    work_name = work_row['label_en']
    for work in works_by_artist.get(artist_qid, []):
        test_name = work['label']
        ratio = fuzz.ratio(test_name, work_name)
        if ratio <= 60:
            continue

        # The other scores are only reported, so they are computed for matches only
        partial_ratio = fuzz.partial_ratio(test_name, work_name)
        sort_ratio = fuzz.token_sort_ratio(test_name, work_name)
        set_ratio = fuzz.token_set_ratio(test_name, work_name)
        w_ratio = fuzz.WRatio(test_name, work_name)

        print('artist:', artist_qid)
        print('act work:', work_name)
        print(work['work'], test_name)
        print('name similarity ratio', ratio)
        print('partial ratio', partial_ratio)
        print('sort_ratio', sort_ratio)
        print('set_ratio', set_ratio)
        print('w_ratio', w_ratio)
        print()
        possible_match_list.append({'score': ratio, 'artist': artist_qid, 'act': work_row['act'], 'act_title': work_name, 'match_qid': work['work'], 'match_label': test_name})
write_dicts_to_csv(possible_match_list, 'possible_matches.csv', ['score', 'artist', 'act', 'act_title', 'match_qid', 'match_label'])
print('done')

## Remove possible matches and "details" from the list of works to write

In [None]:
# Drop works that have a possible match in Wikidata, then separate out the works whose
# label contains "detail"; what is left is saved as the list of works to write.
works_frame = pd.read_csv('works_to_write.csv', na_filter=False, dtype = str)

possible_matches_frame = pd.read_csv('possible_matches.csv', na_filter=False, dtype = str)
# Non-redundant list of the act values that had at least one possible match
possible_matches_list = list(set(possible_matches_frame['act']))

# Keep only the rows whose act value is NOT in the possible matches list (tilde for NOT)
has_possible_match = works_frame['act'].isin(possible_matches_list)
remaining_works = works_frame[~has_possible_match]

# Flag works that contain "Detail" or "detail" in their labels (case insensitive)
is_detail = remaining_works['label_en'].str.contains('detail', case=False)

details_frame = remaining_works[is_detail]
details_frame.to_csv('works_that_are_details.csv', index = False)

out_frame = remaining_works[~is_detail]
out_frame.to_csv('test_works_to_write.csv', index = False)
print('done')


## Script to screen missing artist works


In [None]:
# Interactive screening of the works with no creator. For each work the image, a link to
# its Commons page, and its description are shown, and the typed response decides how the
# description and creator are edited:
#   (empty) leave the row the way it is
#   0       give only the genre (the text before the first " by ")
#   1       leave the description, but add anonymous as creator
#   2       genre up to the first comma or semicolon + "by artist unknown" + photo credit; anonymous creator
#   3       genre + photo credit
#   4       genre + "by artist unknown" + photo credit; anonymous creator
#   5       genre + "by artist unknown"; anonymous creator
#   other   the typed text is used as the description
# The table is saved to test.csv after every work so that progress isn't lost.
missing_artists_work_frame = pd.read_csv('missing_creators.csv', na_filter=False, dtype = str)
#missing_artists_work_frame = missing_artists_work_frame.head(6).copy()

# Make sure the role column exists so that values assigned below end up in the saved table
if 'creator_object_has_role' not in missing_artists_work_frame.columns:
    missing_artists_work_frame['creator_object_has_role'] = ''

# Rows are looked up by index label and edits are written with .at, because changes made to
# the row objects produced by iterrows() are not guaranteed to reach the DataFrame.
for work_index in missing_artists_work_frame.index:
    work_row = missing_artists_work_frame.loc[work_index]
    print(work_row['image'])

    page_url = filename_to_commons_page_url(work_row['image'])
    commons_url = filename_to_commons_url(work_row['image'])
    bookmark = page_url + '#mw-imagepage-content'

    # Get image by url and load into cv2. The client is identified with the same User-Agent
    # as the SPARQL requests, and the connection is closed once the bytes are read.
    image_request = urlrequest.Request(commons_url, headers={'User-Agent': requestheader['User-Agent']})
    with urlrequest.urlopen(image_request) as resp:
        image = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)

    if image is None:
        # imdecode returns None when it can't decode the data
        print('Image could not be decoded; use the link below to view it.')
    else:
        # cv2 decodes to BGR channel order, but matplotlib expects RGB
        plt.imshow(convertToRGB(image))
        plt.show()

    print(bookmark)
    description = work_row['description_en']
    print(description)
    pieces = description.split(' by ')

    choice = input('')

    new_description = description
    add_photo_credit = False
    add_anonymous_creator = False

    # Leave the way it is
    if choice == '':
        pass
    # Give only the genre
    elif choice == '0':
        new_description = pieces[0]
    # Leave the description, but add anonymous as creator
    elif choice == '1':
        add_anonymous_creator = True
    # Try to insert artist unknown and leave photo by, add anonymous as creator
    elif choice == '2':
        if ',' in pieces[0]:
            prefix = pieces[0].split(',')[0]
        elif ';' in pieces[0]:
            prefix = pieces[0].split(';')[0]
        else:
            prefix = pieces[0]
        new_description = prefix + ' by artist unknown'
        add_photo_credit = True
        add_anonymous_creator = True
    # Insert that it's the photographer
    elif choice == '3':
        new_description = pieces[0]
        add_photo_credit = True
    # Insert that it's the photographer with artist unknown, add anonymous as creator
    elif choice == '4':
        new_description = pieces[0] + ' by artist unknown'
        add_photo_credit = True
        add_anonymous_creator = True
    # Give only the genre, but also add artist unknown and anonymous creator
    elif choice == '5':
        new_description = pieces[0] + ' by artist unknown'
        add_anonymous_creator = True
    # If something else was typed in, use it as the description
    else:
        new_description = choice

    if add_photo_credit:
        if len(pieces) > 1:
            new_description = new_description + ', photo by ' + pieces[1]
        else:
            # Without a " by " in the description there is no name to credit
            print('No " by " found in the description; photo credit not added.')

    missing_artists_work_frame.at[work_index, 'description_en'] = new_description
    if add_anonymous_creator:
        missing_artists_work_frame.at[work_index, 'creator'] = '_:'
        missing_artists_work_frame.at[work_index, 'creator_object_has_role'] = 'Q4233718'
    print()

    missing_artists_work_frame.to_csv('test.csv', index = False)

print('done')