This notebook is the last step before completing Phase 2 of the ACT project by uploading with VanderBot.

In [None]:
import pandas as pd
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)
import requests
import json
import re # regex
from time import sleep

# ---- Configuration ----
sparql_sleep = 0.1 # number of seconds to pause after each query to the SPARQL endpoint
get_server_sleep = 0.1 # number of seconds to pause after each GET call to the Commons web server
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# ---- Input data ----
# Every CSV is read with dtype=str and na_filter=False, so all cells are strings
# and empty cells come through as '' rather than NaN.
# The 'act' column of each problem file is also kept as a plain list for membership tests.
date_problems_frame = pd.read_csv('issues_with_inception_dates.csv', na_filter=False, dtype = str)
date_problems_list = list(date_problems_frame['act'])
creator_mismatches_frame = pd.read_csv('creator_name_mismatches.csv', na_filter=False, dtype = str)
creator_mismatches_list = list(creator_mismatches_frame['act'])
works_frame = pd.read_csv('abstract_artworks_charlotte_edits.csv', na_filter=False, dtype = str)
# NOTE(review): only the first 7 works are kept from here on -- this looks like a
# testing limit; confirm it should be removed before a full run.
works_frame = works_frame.head(7).copy()
# NOTE(review): more code follows this line in the same cell, so this expression's
# value does not appear to be displayed.
works_frame.head()

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Build the HTTP headers sent with requests to the SPARQL endpoint.

    Parameters
    ----------
    accept_media_type : str
        Value to place in the ``Accept`` header.

    Returns
    -------
    dict
        Headers with the keys ``Accept``, ``Content-Type`` (always
        ``application/sparql-query``) and ``User-Agent`` (the VanderBot
        identification string).
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# Build the request headers once; get_wikidata_item_label() sends them with each SPARQL query.
requestheader = generate_header_dictionary(accept_media_type)

def check_commons_page_for_wikidata_image_link(image_filename):
    """Find a Wikidata Q ID on a Commons file page via the small Wikidata logo image.

    Fetches ``https://commons.wikimedia.org/wiki/File:<image_filename>``, looks
    for the first ``<img>`` whose ``src`` is the 20px Wikidata logo, and takes
    the ``href`` of that image's parent tag as the link to the Wikidata item.

    Parameters
    ----------
    image_filename : str
        Commons file name without the ``File:`` prefix.

    Returns
    -------
    str
        The last path segment of the link (the Q ID), or ``''`` when the logo
        image is not on the page or its parent tag carries no ``href``.
    """
    # image_filename = 'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'
    # image_filename = 'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    # Identify the client on page requests as well as on SPARQL queries; only the
    # User-Agent is reused because the other SPARQL headers do not apply to a page GET.
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    soup = BeautifulSoup(response.text, 'html.parser')

    # find_all is the current spelling of the deprecated findAll alias
    logo_tags = soup.find_all('img', src='https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Wikidata-logo.svg/20px-Wikidata-logo.svg.png')
    if len(logo_tags) > 0:
        # The logo is expected to sit inside the anchor that links to the item.
        # .get() yields '' instead of raising KeyError when the parent has no href.
        link = logo_tags[0].parent.get('href', '')
        qid = extract_localname(link)
    else:
        qid = ''
    sleep(get_server_sleep) # Don't hit the server too fast
    return qid

def check_commons_page_for_wikidata_link(image_filename):
    """Find a Wikidata Q ID in the file-information table of a Commons file page.

    Fetches ``https://commons.wikimedia.org/wiki/File:<image_filename>``, takes
    the first table whose class matches ``fileinfotpl``, finds the span with
    ``id="artwork"`` inside it, and looks under that span's parent for the first
    anchor whose ``href`` matches ``https://www.wikidata.org/wiki/``.

    Parameters
    ----------
    image_filename : str
        Commons file name without the ``File:`` prefix.

    Returns
    -------
    str
        The last path segment of the matched link (the Q ID), or ``''`` when
        the table, the span, or the anchor is missing.
    """
    # image_filename = 'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'
    # image_filename = 'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    # Identify the client on page requests as well as on SPARQL queries; only the
    # User-Agent is reused because the other SPARQL headers do not apply to a page GET.
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    qid = ''  # stays empty unless every step of the lookup succeeds

    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', class_=re.compile('fileinfotpl'))
    if len(tables) > 0:
        # Have to check for this span because there are other subtables with a tags besides the one at the top
        spans = tables[0].find_all('span', id='artwork')
        if len(spans) > 0:
            # The link to the Wikidata item will be in an href
            # Need to go up to parent, since the anchor is sometimes a sibling tag and not a child tag
            anchors = spans[0].parent.find_all('a', href=re.compile('https://www.wikidata.org/wiki/'))
            if len(anchors) > 0:
                # .get() replaces the former bare try/except: a missing href gives ''
                # without also swallowing unrelated errors such as KeyboardInterrupt.
                qid = extract_localname(anchors[0].get('href', ''))

    sleep(get_server_sleep) # Don't hit the server too fast
    return qid

# function to get local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI that follows its final '/'.

    For ``http://www.wikidata.org/entity/Q6386232`` this is ``Q6386232``.
    A string containing no '/' is returned unchanged, and a string ending in
    '/' yields ``''``.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

def get_wikidata_item_label(qid):
    """Look up the English label of a Wikidata entity through the SPARQL endpoint.

    Parameters
    ----------
    qid : str
        Bare entity identifier (for example ``Q6386232``). It is inserted
        directly after the ``wd:`` prefix in the query text, so it must not
        contain anything other than the identifier itself.

    Returns
    -------
    str
        The first English ``rdfs:label`` returned, or ``''`` when the query
        returns no bindings.

    Raises
    ------
    requests.HTTPError
        When the endpoint answers with an error status (for example when it
        rejects the query or throttles the client).
    """
    query = '''
    select distinct ?label
    where {
    wd:''' + qid + ''' rdfs:label ?label.
    filter(lang(?label)="en")
    }'''

    response = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    # Fail with the HTTP status rather than with a confusing JSON decoding error
    # when the endpoint returns an error page instead of results.
    response.raise_for_status()
    data = response.json()

    # extract the values from the response JSON
    results = data['results']['bindings']
    if len(results) > 0:
        label = results[0]['label']['value']
    else:
        label = ''
    sleep(sparql_sleep) # pause so successive queries are spaced out
    return label



## Pull out rows with Commons page having link to Wikidata

There are a bunch of rows, mostly with details, where the link to Wikidata wasn't found. This checks using the new function that looks for the tiny Wikidata flag image to get the link.

In [None]:
# Split the works into those whose Commons page links to a Wikidata item and those
# that still need screening. Each work costs one Commons page request, plus one
# SPARQL query when a link is found.
# NOTE(review): the markdown above describes the check that looks for the small
# Wikidata logo image (check_commons_page_for_wikidata_image_link), but this loop
# calls the file-info-table check -- confirm which function is intended.
wikidata_link_found_output_list = []
work_qid_list = []
work_label_list = []
remaing_works_list = []
for work_index, work_row in works_frame.iterrows():
    print(work_row['act']) # progress indicator

    qid = check_commons_page_for_wikidata_link(work_row['image'])

    if qid != '':
        wikidata_link_found_output_list.append(work_row)
        work_qid_list.append(qid)
        work_label_list.append(get_wikidata_item_label(qid))
    else:
        remaing_works_list.append(work_row)

# Passing the columns explicitly keeps the column headers even when a list is
# empty; without them an empty list gives a frame (and CSV) with no columns.
remaing_works_frame = pd.DataFrame(remaing_works_list, columns=works_frame.columns)

wikidata_link_found_output_frame = pd.DataFrame(wikidata_link_found_output_list, columns=works_frame.columns)

# Stick the lists of found work Q IDs and labels onto the end of the DataFrame before saving
wikidata_link_found_output_frame['work_qid'] = work_qid_list
wikidata_link_found_output_frame['work_label'] = work_label_list

wikidata_link_found_output_frame.to_csv('wikidata_link_found.csv')
wikidata_link_found_output_frame.head()


## Pull out rows with date problems

In [None]:
# Move every remaining work whose 'act' value appears in the date-problem list
# into its own output file; everything else stays in the remaining set.
date_problem_acts = set(date_problems_list) # set membership avoids rescanning the list per work
work_columns = remaing_works_frame.columns  # captured before the frame is replaced below

date_problems_output_list = []
remaing_works_list = []
for work_index, work_row in remaing_works_frame.iterrows():
    # Each flagged work is recorded exactly once, even if its 'act' value occurs
    # on several rows of the problem file.
    if work_row['act'] in date_problem_acts:
        date_problems_output_list.append(work_row)
    else:
        remaing_works_list.append(work_row)

# Explicit columns keep the headers when either list is empty
remaing_works_frame = pd.DataFrame(remaing_works_list, columns=work_columns)

date_problems_output_frame = pd.DataFrame(date_problems_output_list, columns=work_columns)
date_problems_output_frame.to_csv('date_problems.csv')
date_problems_output_frame.head()


## Pull out rows with missing creator values


In [None]:
# Move every remaining work with an empty 'creator' value into its own output
# file; everything else stays in the remaining set.
work_columns = remaing_works_frame.columns  # captured before the frame is replaced below

missing_creators_output_list = []
remaing_works_list = []
for work_index, work_row in remaing_works_frame.iterrows():
    # The CSVs were read with na_filter=False, so a missing creator is '' rather than NaN
    if work_row['creator'] == '':
        missing_creators_output_list.append(work_row)
    else:
        remaing_works_list.append(work_row)

# Explicit columns keep the headers when either list is empty
remaing_works_frame = pd.DataFrame(remaing_works_list, columns=work_columns)

missing_creators_output_frame = pd.DataFrame(missing_creators_output_list, columns=work_columns)
missing_creators_output_frame.to_csv('missing_creators.csv')
missing_creators_output_frame.head()

## Pull out rows with creator name mismatches 

In [None]:
# Move every remaining work whose 'act' value appears in the creator-mismatch
# list into its own output file; everything else stays in the remaining set.
creator_mismatch_acts = set(creator_mismatches_list) # set membership avoids rescanning the list per work
work_columns = remaing_works_frame.columns  # captured before the frame is replaced below

creator_mismatch_output_list = []
remaing_works_list = []
for work_index, work_row in remaing_works_frame.iterrows():
    # Each flagged work is recorded exactly once, even if its 'act' value occurs
    # on several rows of the mismatch file.
    if work_row['act'] in creator_mismatch_acts:
        creator_mismatch_output_list.append(work_row)
    else:
        remaing_works_list.append(work_row)

# Explicit columns keep the headers when either list is empty
remaing_works_frame = pd.DataFrame(remaing_works_list, columns=work_columns)

creator_mismatch_output_frame = pd.DataFrame(creator_mismatch_output_list, columns=work_columns)
creator_mismatch_output_frame.to_csv('creator_mismatch_problems.csv')
creator_mismatch_output_frame.head()


## Write the works that remain after screening

In [None]:
# Save whatever is left in remaing_works_frame after the screening cells above
# (the frame's index is written as the first CSV column), then preview the first rows.
remaing_works_frame.to_csv('works_to_write.csv')
remaing_works_frame.head()
