# Compare data dumped from ACT database to what's in Commons

This notebook was initially written to create artwork items for ACT works whose ACT identifier is attached to a non-artwork item in Wikidata.

## Function section

This cell must be run before any of the other cells.


In [None]:
# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

from pathlib import Path
import requests
from time import sleep
import json
import urllib
import csv
import os
from fuzzywuzzy import fuzz # fuzzy logic matching
from copy import deepcopy
from langdetect import detect
from langdetect import detect_langs
import datetime

import re # regex
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import RegexpParser

# ----------------
# Configuration settings
# ----------------

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
get_server_sleep = 0.1 # number of seconds to wait before get calls to webserver
dots_sleep = 1 # number of seconds to wait between calls to ParallelDots API
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# Calculate the reference date retrieved value for all statements
# datetime.utcnow() is deprecated (Python 3.12+), so take an aware UTC time and drop the tzinfo
# again; that keeps the string in the same naive form as before (no "+00:00" offset).
whole_time_string_z = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() # form: 2019-12-05T15:35:04.959311
dateZ = whole_time_string_z.split('T')[0] # form 2019-12-05
ref_retrieved = dateZ + 'T00:00:00Z' # form 2019-12-05T00:00:00Z as provided by Wikidata, without leading +

# ----------------
# Utility functions
# ----------------

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Return the HTTP headers sent with requests to the SPARQL endpoint.

    accept_media_type: value to use for the Accept header.
    The Content-Type is always 'application/sparql-query' and the User-Agent
    identifies this client (VanderBot) to the server.
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'VanderBot/1.6 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# Default request headers, built once from accept_media_type and reused by the query functions below
requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first line of the file supplies the dictionary keys (csv.DictReader).
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows before the file is closed
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file with a header row.

    table: iterable of row dictionaries.
    filename: path of the file to create or overwrite.
    fieldnames: column names, in output order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

def load_credential(filename, directory):
    """Read a credential file and return its entire contents as a string.

    filename: name (or path) of the credential file.
    directory: 'home' to look for the file in the user's home directory; any
        other value means the path is used as given (relative to the working
        directory).

    If the file cannot be opened, a message is printed and the interpreter
    exits (SystemExit is raised).
    """
    import sys  # local import; sys.exit() does not depend on the site module the way exit() does

    if directory == 'home':
        # Path.home() works for both Win and Mac
        credential_path = str(Path.home()) + '/' + filename
    else:
        directory = 'working'  # only used in the error message below
        credential_path = filename
    try:
        with open(credential_path, 'rt', encoding='utf-8') as file_object:
            return file_object.read()
    except OSError:
        # Only file-access problems are reported here; other errors (e.g. Ctrl-C) propagate
        print(filename + ' file not found - is it in your ' + directory + ' directory?')
        sys.exit()

# find non-redundant values for a column or simple list
def non_redundant(table, column_key):
    """Return the distinct values of a column (or of a simple list) in first-seen order.

    table: list of rows (dictionaries), or a simple list of values.
    column_key: key of the column to examine; pass '' to compare the rows themselves.

    Values are compared with ==, so they do not need to be hashable.
    """
    unique_values = []
    for row in table:
        candidate = row if column_key == '' else row[column_key]
        already_seen = any(candidate == seen for seen in unique_values)
        if not already_seen:
            unique_values.append(candidate)
    return unique_values

# function to use in sort of simple list
def sort_funct(row):
    """Sort key for a simple list: each element is its own key."""
    sort_key = row
    return sort_key

# function to use in sort last_first names
def sort_last_first(row):
    """Sort key for rows: the value stored under 'last_first'."""
    sort_key = row['last_first']
    return sort_key

# function to use in sort by match score
def sort_score(row):
    """Sort key for rows: the value stored under 'score'."""
    sort_key = row['score']
    return sort_key

# function to get local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI after its last slash.

    With a pattern like http://www.wikidata.org/entity/Q6386232 the result is
    the Q ID (Q6386232). A string with no slash is returned unchanged.
    """
    # Splitting once from the right is enough: only the final piece is wanted
    return iri.rsplit('/', 1)[-1]

# search label and alias
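# Class membership is tested with the full statement path (p:P31 / ps:P31) rather than wdt:P31.
# For whatever reason, if I use the graph pattern
#
#   wd:Q21 wdt:P31 ?class.
#
# England (Q21) is not matched as Q6256 (country), but if I use the graph pattern
#
#   wd:Q21 p:P31 ?statement.
#   ?statement ps:P31 ?class.
#
# it is ??!!
# (Probably because wdt: only matches best-rank "truthy" statements, whereas p:/ps: matches
# statements of any rank -- not verified for this item.)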
def searchLabelsAtWikidata(string, class_list):
    """Find Wikidata items whose English label or alias matches a string.

    string: the label/alias text to search for.
    class_list: list of Q IDs (without the wd: prefix); each adds one more
        alternative to the union, matching items that have a P31 statement
        with that value.

    Returns a list of Q ID strings. Raises whatever requests/JSON decoding
    raises if the endpoint does not return the expected JSON.
    """
    # Escape characters that would otherwise end or corrupt the SPARQL string literal
    literal = (string.replace('\\', '\\\\')
                     .replace('"', '\\"')
                     .replace('\n', '\\n')
                     .replace('\r', '\\r'))

    # create a string for the query
    query = 'select distinct ?id '
    query += '''where {
  {?id rdfs:label "''' + literal + '''"@en.}
  union
  {?id skos:altLabel "''' + literal + '''"@en.}
  '''
    # One group graph pattern per class, joined with "union"
    class_patterns = ['''{?id p:P31 ?statement.
  ?statement ps:P31 wd:''' + class_qid + '''.}
  ''' for class_qid in class_list]
    query += '''union
  '''.join(class_patterns)
    query += '''}'''

    r = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    data = r.json()
    return_value = [extract_localname(result['id']['value']) for result in data['results']['bindings']]

    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def retrieve_gallery_classes():
    """Query Wikidata for the distinct classes (P31 values) of items in one collection.

    Returns a list of {'label': English class label, 'qid': class Q ID} dictionaries,
    ordered by label.

    NOTE(review): a second function with this same name is defined later in this
    file; when the whole cell is run, that later definition replaces this one.
    """
    # use Metropolitan Museum of Art because there are too many collections to not specify the collection.
    query = '''select distinct ?class ?label where 
      {
      ?item wdt:P195 wd:Q160236.
      ?item wdt:P31 ?class.
      ?class rdfs:label ?label.
      filter(lang(?label) = 'en')
      }
      order by ?label'''

    print('sending query')
    response = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    print('results returned')

    return_value = []
    for binding in response.json()['results']['bindings']:
        class_qid = extract_localname(binding['class']['value'])
        return_value.append({'label': binding['label']['value'], 'qid': class_qid})

    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def generateNameAlternatives(name):
    """Generate plausible written variants of a personal name.

    Commas are treated as spaces and periods are dropped. A trailing suffix
    (Jr, Sr, II, III, IV, V, or "the elder") is set aside and only re-attached
    to the "full name with suffix" variant. For names of two or more words the
    variants combine full given names, initials (with and without periods) and
    the last word of the name.

    Returns a list of distinct strings in a stable (first-generated) order.
    An empty or whitespace-only name gives an empty list; a one-word name
    gives just that word.
    """
    def initial_of(piece):
        # Make sure the initial is alphabetic where possible: typical cases are like (Kit) or "Kit",
        # so skip a single leading non-alphabetic character. A lone symbol is kept as its own
        # "initial" so the initials always stay aligned with the pieces.
        if piece[0].isalpha() or len(piece) == 1:
            return piece[0]
        return piece[1]

    # treat commas as if they were spaces and get rid of periods
    name = name.replace(',', ' ').replace('.', '')

    # split() with no argument collapses runs of whitespace, so "Smith, John" (which became
    # "Smith  John") cannot produce empty pieces that would misalign initials and pieces
    pieces = name.split()
    if not pieces:
        return []

    # Remove ", Jr.", "III", etc. from end of name, but never strip the only word
    suffix_forms = {'Jr': ', Jr.', 'Sr': ', Sr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    suffix = ''
    if len(pieces) > 1 and pieces[-1] in suffix_forms:
        suffix = suffix_forms[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
        suffix = ' the elder'
        pieces = pieces[:-2]

    # full name, and full name with suffix
    full_name = ' '.join(pieces)
    alternatives = [full_name]
    if suffix != '':
        alternatives.append(full_name + suffix)

    # The remaining variants need a separate first and last word
    if len(pieces) > 1:
        initials = [initial_of(piece) for piece in pieces]
        first_name = pieces[0]
        last_name = pieces[-1]
        middle_initials = initials[1:-1]
        given_initials = initials[:-1]

        # first and last name with initials
        alternatives.append(' '.join([first_name] + middle_initials + [last_name]))
        # first and last name with initials and periods
        alternatives.append(' '.join([first_name] + [letter + '.' for letter in middle_initials] + [last_name]))
        # first and last name only
        alternatives.append(first_name + ' ' + last_name)
        # first initial and last name only
        alternatives.append(initials[0] + ' ' + last_name)
        # first initial with period and last name only
        alternatives.append(initials[0] + '. ' + last_name)
        # all name initials with last name
        alternatives.append(' '.join(given_initials + [last_name]))
        # all name initials with periods with last name
        alternatives.append(' '.join([letter + '.' for letter in given_initials] + [last_name]))
        # all name initials concatenated with last name
        alternatives.append(''.join(given_initials) + ' ' + last_name)

    # remove duplicates, keeping first-seen order so the result is deterministic
    return list(dict.fromkeys(alternatives))

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias matches a variant of a name.

    name: the name to search for; variants come from generateNameAlternatives().

    Returns a list of {'qId': ..., 'name': English label} dictionaries. If the
    response cannot be parsed as the expected JSON, returns
    [{'error': <response text>}] instead (best-effort; no exception is raised
    for that case).
    """
    nameList = generateNameAlternatives(name)
    if not nameList:
        # Nothing to search for; avoid sending a query with an empty VALUES block
        return []

    value_lines = []
    for alternative in nameList:
        # get rid of quotes and backslashes, which will break the query
        cleaned = alternative.replace('"', '').replace("'", '').replace('\\', '')
        value_lines.append('"' + cleaned + '"@en\n')
    alternatives = ''.join(value_lines)

    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    results = []
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        data = r.json()
        for statement in data['results']['bindings']:
            label = statement['label']['value'] if 'label' in statement else ''
            results.append({'qId': extract_localname(statement['item']['value']), 'name': label})
    except Exception:
        # Deliberately broad (bad JSON, unexpected structure), but no longer swallows Ctrl-C
        results = [{'error': r.text}]
    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def name_variant_testing(name, variant):
    """Return the fuzzy-match ratio between the given-name parts of two names.

    Periods are removed from both strings; everything before the last
    space-separated word is treated as the given names (a one-word string is
    used whole). The two given-name strings are compared with fuzz.ratio.
    """
    def given_names(full_name):
        without_periods = full_name.replace('.', '')
        # Dropping the final space-separated word leaves the first names;
        # with no space at all, the whole string is returned
        return without_periods.rsplit(' ', 1)[0]

    # I experimented with the different ratios and I think fuzz might be best.
    return fuzz.ratio(given_names(name), given_names(variant))

def find_surname_givens(name):
    """Split a name into given names and family name.

    Periods and commas are removed, one leading non-alphabetic character is
    stripped from each word, empty words are dropped, and the first occurrence
    of each generational suffix (Jr, Sr, II, III, IV, V) is removed.

    Returns {'given': ..., 'family': ...} where family is the last remaining
    word, or False when fewer than two words remain.
    """
    # Get rid of periods and commas
    name = name.replace('.', '').replace(',', '')

    # Must be at least a surname and something else
    raw_pieces = name.split(' ')
    if len(raw_pieces) <= 1:
        return False

    pieces = []
    for piece in raw_pieces:
        # Make sure first character is alphabetic
        # only fixes the case where there is one non-alphabetic character, but more than one is rare
        # typical cases are like (Kit) or "Kit"
        if not piece[0:1].isalpha():
            piece = piece[1:]
        # Skip anything that is now empty; could also be caused by double spaces
        if piece:
            pieces.append(piece)

    # Get rid of ", Jr.", "III", etc.
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces
    if len(pieces) < 2:
        return False

    # Everything but the last piece is the given names
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def remove_parens(string):
    """Return the text before the first '(' with surrounding whitespace removed."""
    before_paren, _, _ = string.partition('(')
    return before_paren.strip()

def remove_description(string):
    """Return the text inside the first pair of parentheses, stripped of whitespace.

    Specifically: the text after the first '(' up to the next '(' or ')'.
    Returns '' when there is no '(' (or when anything else goes wrong).
    """
    try:
        return string.split('(')[1].split(')')[0].strip()
    except:
        # Kept deliberately broad to match existing behavior: any failure yields ''
        return ''

def reverse_names(string):
    """Turn a 'Last, First' name into 'First Last'.

    Only the first two comma-separated parts are used (anything after a second
    comma is ignored). A string with no comma has nothing to reverse and is
    returned stripped of surrounding whitespace instead of raising IndexError.
    """
    pieces = string.split(',')
    if len(pieces) < 2:
        return string.strip()
    return pieces[1].strip() + ' ' + pieces[0].strip()

# Screens for Wikidata items that are potential matches

# vb_common_code is a project module (not shown here); the Query objects below are
# configured once and reused. All of them wait sparql_sleep seconds between queries
# (presumably inside the Query methods -- confirm in vb_common_code).
import vb_common_code as vbc
# Values of P31 for an item, without labels; used by human() below
retrieve_class_list_query = vbc.Query(pid='P31', uselabel=False, sleep=sparql_sleep)
# Values of P569 for an item; isitem=False presumably means the values are literals, not items
retrieve_birth_date_query = vbc.Query(isitem=False, pid='P569', sleep=sparql_sleep)
# Values of P570 for an item; configured like the P569 query above
retrieve_death_date_query = vbc.Query(isitem=False, pid='P570', sleep=sparql_sleep)

def human(qId):
    """Screen a Wikidata item: return False only if it has classes and none of them is human (Q5).

    qId: Q ID of the item to check.

    An item with no P31 values passes the screen (returns True). Every P31
    value is checked, not just the first, so an item with several classes is
    still recognized as a human when Q5 is among them.
    """
    wdClassList = retrieve_class_list_query.single_property_values_for_item(qId)
    # if there is no class property, there is nothing to screen on
    if len(wdClassList) == 0:
        return True
    return 'Q5' in wdClassList

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Return a dictionary of descriptors for the item with Wikidata ID qId.

    Keys (present only when the query returns at least one row):
      'description': the English schema:description, or ''
      'orcid': the P496 value, or ''
      'occupation': list of English labels of the P106 values
    If the response cannot be processed, returns {'error': <response text>}.
    """
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')            
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    resultsDict = {}
    r = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
    try:
        statements = r.json()['results']['bindings']
        if statements: # if no results, the dictionary remains empty
            first_row = statements[0]
            # Only a single description per language is allowed, so there should only be one description
            resultsDict['description'] = first_row['description']['value'] if 'description' in first_row else ''
            # Only a single ORCID is allowed, so there should only be one orcid value
            resultsDict['orcid'] = first_row['orcid']['value'] if 'orcid' in first_row else ''
            # if there are multiple rows, that's because there is more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value'] for statement in statements if 'occupation' in statement
            ]
    except:
        # Kept deliberately broad to match existing best-effort behavior
        resultsDict = {'error': r.text}
    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return resultsDict

def determine_era(string):
    """Split a date string into its value and era.

    Returns (value, era) where era is 'BCE', 'CE' or 'unknown'. For BCE/CE the
    last 3 or 2 characters are cut off (the era is assumed to be at the end of
    the string) and the rest is stripped; otherwise the string is returned as is.
    """
    # 'BCE' must be tested first because it also contains 'CE'
    if 'BCE' in string:
        return string[:-3].strip(), 'BCE'
    if 'CE' in string:
        return string[:-2].strip(), 'CE'
    # dates with no CE or BCE, including empty string
    return string, 'unknown'

def determine_zeros(date):
    """Return the number of consecutive '0' characters at the end of a date string.

    Always returns an int: an empty string gives 0 and an all-zero string
    gives its length (previously both fell off the end of the loop and
    returned None).
    """
    return len(date) - len(date.rstrip('0'))

def pad_zeros_left(date_string):
    """Left-pad a string with '0' characters to a width of 4; longer strings are unchanged."""
    # rjust (not zfill) so a leading sign character gets no special treatment
    return date_string.rjust(4, '0')

def sign(era):
    """Return the sign prefix for an era: '-' for 'BCE', '' for anything else (including 'CE')."""
    return '-' if era == 'BCE' else ''

def detect_language(string):
    """Detect the most probable language of a string.

    Returns (lang, confidence): the language code reported by langdetect for
    its top candidate and that candidate's probability as a float. When
    detection fails (e.g. there is no information to decide, as with numbers),
    returns ('zxx', 0.0).

    The code and probability are read from the result object's attributes, so
    codes longer than two characters (e.g. 'zh-cn') are returned whole instead
    of failing the fixed-width string slicing and falling through to 'zxx'.
    """
    try:
        best_guess = detect_langs(string)[0]
        lang = best_guess.lang
        confidence = float(best_guess.prob)
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    return lang, confidence

# URL prefix for Commons file (Special:FilePath) URLs; the URL-encoded filename follows it
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
# URL prefix for Commons file description pages; the filename (underscores for spaces) follows it
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'
def commons_url_to_filename(url):
    """Convert a Commons Special:FilePath URL to the plain (URL-decoded) filename.

    Form of URL is: http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg
    Raises IndexError if the URL does not contain commons_prefix.
    """
    encoded_local_name = url.split(commons_prefix)[1] # local name (file) part of the URL
    return urllib.parse.unquote(encoded_local_name) # reverse the URL-encoding

def filename_to_commons_url(filename):
    """Build the Commons Special:FilePath URL for a filename (URL-encoding the filename)."""
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Convert a Commons file page URL to the plain filename.

    Form of URL is: https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg
    Underscores become spaces, then the URL-encoding is reversed.
    Raises IndexError if the URL does not contain commons_page_prefix.
    """
    local_name = url.split(commons_page_prefix)[1] # local name (file) part of the URL
    with_spaces = local_name.replace('_', ' ')
    return urllib.parse.unquote(with_spaces)

def filename_to_commons_page_url(filename):
    """Build the Commons file page URL for a filename.

    Spaces become underscores and the name is URL-encoded, except that
    parentheses and commas are left unencoded.
    """
    underscored = filename.replace(' ', '_')
    url = commons_page_prefix + urllib.parse.quote(underscored)
    # Put back the characters that are left literal in these page URLs
    for encoded, literal in (('%28', '('), ('%29', ')'), ('%2C', ',')):
        url = url.replace(encoded, literal)
    return url

def retrieve_gallery_classes():
    """Find Wikidata items that have an ACT identifier (P9092) and an image but none of the listed artwork types.

    Despite its name, this does not return classes of a gallery: it returns one
    dictionary per (item, class, image) combination with the keys 'qid',
    'act_id', 'class_label', 'label' (English label, or '') and 'image'
    (Commons filename decoded from the image URL).
    """
    # Find the Q IDs of works with ACT identifiers that do not have one of the artwork types
    query = '''select distinct ?item ?actId ?classLabel ?itemLabel ?image where {
  ?item wdt:P9092 ?actId.
  ?item wdt:P18 ?image.
  optional {
    ?item rdfs:label ?itemLabel.
    filter(lang(?itemLabel)="en")
    }
  ?item wdt:P31 ?class.
  ?class rdfs:label ?classLabel.
  filter(lang(?classLabel)="en")
  minus {?item wdt:P31 wd:Q3305213.} # painting
  minus {?item wdt:P31 wd:Q15711026.} # altarpiece  
  minus {?item wdt:P31 wd:Q93184.} # drawing
  minus {?item wdt:P31 wd:Q22669139.} # fresco
  minus {?item wdt:P31 wd:Q15123870.} # lithograph
  minus {?item wdt:P31 wd:Q8362.} # miniature
  minus {?item wdt:P31 wd:Q133067.} # mosaic
  minus {?item wdt:P31 wd:Q219423.} # mural
  minus {?item wdt:P31 wd:Q125191.} # photograph
  minus {?item wdt:P31 wd:Q11060274.} # print
  minus {?item wdt:P31 wd:Q1064538.} # quilt
  minus {?item wdt:P31 wd:Q245117.} # relief sculpture
  minus {?item wdt:P31 wd:Q860861.} # sculpture
  minus {?item wdt:P31 wd:Q2282251.} # seven-branched candlestick
  minus {?item wdt:P31 wd:Q1473346.} # stained glass
  minus {?item wdt:P31 wd:Q179700.} # statue
  minus {?item wdt:P31 wd:Q18761202.} # watercolor painting
  minus {?item wdt:P31 wd:Q48498.} # illuminated manuscript
  minus {?item wdt:P31 wd:Q184296.} # tapestry
  minus {?item wdt:P31 wd:Q811979.} # architectural structure
  minus {?item wdt:P31 wd:Q1278452.} # polyptych
  minus {?item wdt:P31 wd:Q15727816.} # painting series
  minus {?item wdt:P31 wd:Q16744570.} # tablet
  minus {?item wdt:P31 wd:Q87167.} # manuscript
  minus {?item wdt:P31 wd:Q16905563.} # cycle of paintings
  minus {?item wdt:P31 wd:Q132137.} # icon
  minus {?item wdt:P31 wd:Q18887969.} # copper engraving print
  minus {?item wdt:P31 wd:Q28823.} # textile
  minus {?item wdt:P31 wd:Q2293362.} # sculptural group
  }
#order by ?classLabel
'''

    print('sending query')
    response = requests.post(endpoint, data=query.encode('utf-8'), headers=generate_header_dictionary(accept_media_type))
    print('results returned')

    return_value = []
    for binding in response.json()['results']['bindings']:
        # The item label is optional in the query, so it may be missing from a row
        item_label = binding['itemLabel']['value'] if 'itemLabel' in binding else ''
        return_value.append({
            'qid': extract_localname(binding['item']['value']),
            'act_id': binding['actId']['value'],
            'class_label': binding['classLabel']['value'],
            'label': item_label,
            'image': commons_url_to_filename(binding['image']['value']),
        })

    # pause (sparql_sleep seconds) to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)

    return return_value

def check_commons_page_for_wikidata_link(image_filename):
    """Scrape a Commons file page for the Wikidata item linked from its artwork info table.

    image_filename: Commons filename, e.g.
        'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg' or
        'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'

    Returns the Q ID found in the first Wikidata link next to the 'artwork'
    span of the first file info table, or '' if there is none.
    """
    # Build a properly URL-encoded page URL so characters such as '?', '#' or '%' in a
    # filename cannot change the meaning of the URL
    page_url = filename_to_commons_page_url(image_filename)
    # Identify the client; only the User-Agent is reused because the other default headers are SPARQL-specific
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    qid = ''
    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', class_=re.compile('fileinfotpl'))
    if len(tables) > 0:
        # Have to check for this span because there are other subtables with a tags besides the one at the top
        spans = tables[0].findAll('span', id='artwork')
        if len(spans) > 0:
            # The link to the Wikidata item will be in an href
            # Need to go up to parent, since the anchor is sometimes a sibling tag and not a child tag
            anchors = spans[0].parent.findAll('a', href=re.compile('https://www.wikidata.org/wiki/'))
            if len(anchors) > 0:
                # Anchors were selected by their href, so the attribute is present
                qid = extract_localname(anchors[0]['href'])

    sleep(get_server_sleep) # Don't hit the server too fast
    return qid

def check_commons_page_for_wikidata_image_link(image_filename):
    """Scrape a Commons file page for the Wikidata item linked from the small Wikidata logo.

    image_filename: Commons filename, e.g.
        'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg' or
        'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'

    Returns the Q ID taken from the href of the element enclosing the first
    20px Wikidata logo image, or '' if the logo is absent or its parent has no href.
    """
    # Build a properly URL-encoded page URL so characters such as '?', '#' or '%' in a
    # filename cannot change the meaning of the URL
    page_url = filename_to_commons_page_url(image_filename)
    # Identify the client; only the User-Agent is reused because the other default headers are SPARQL-specific
    response = requests.get(page_url, headers={'User-Agent': requestheader['User-Agent']})

    soup = BeautifulSoup(response.text, 'html.parser')
    image_tags = soup.findAll('img', src='https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Wikidata-logo.svg/20px-Wikidata-logo.svg.png')

    qid = ''
    if len(image_tags) > 0:
        # The logo is expected to sit inside the anchor that links to the item;
        # .get() avoids a KeyError when the parent is some other element
        link = image_tags[0].parent.get('href')
        if link:
            qid = extract_localname(link)

    sleep(get_server_sleep) # Don't hit the server too fast
    return qid


# Script to find out if works have items under a different image

Check the Commons page of each work to see whether it has a Wikidata link pointing to a different item.

In [None]:
# Items with an ACT ID that are not typed as artworks
hits = retrieve_gallery_classes()
print('Items found:', len(hits))
print()

filename = 'works_already_in_wikidata.csv'
works = read_dict(filename)

results_list = []
for hit in hits:
    result_dict = {'bad_qid': hit['qid'], 'bad_label': hit['label']}

    # Look up hit in list of works already in Wikidata (first row with the same record number)
    work = next((candidate for candidate in works if hit['act_id'] == candidate['RecordNumber']), None)
    if work is None:
        print(hit['qid'], 'not found in ACT database. ', hit['label'])
        continue

    result_dict['filename'] = work['filename']

    # Check the Commons web page for a Wikidata link; only hits with a link are recorded
    qid = check_commons_page_for_wikidata_link(work['filename'])
    if qid != '':
        result_dict['good_qid'] = qid
        results_list.append(result_dict)
    else:
        qid = 'no link'
    print(hit['qid'], hit['label'], qid)

fieldnames = ['bad_qid', 'bad_label', 'filename', 'good_qid']
write_dicts_to_csv(results_list, 'wikidata_items_to_check.csv', fieldnames)

print('done')


The following is a modified version of the script above, used after adding the abstract artworks that were missing.

In [None]:
# Items with an ACT ID that are not typed as artworks
hits = retrieve_gallery_classes()
print('Items found:', len(hits))
print()

filename = 'works_already_in_wikidata.csv'
works = read_dict(filename)

results_list = []
for hit in hits:
    result_dict = {
        'bad_qid': hit['qid'],
        'bad_class': hit['class_label'],
        'bad_label': hit['label'],
        'image': hit['image'],
    }
    # Look up hit in list of works already in Wikidata (first row with the same record number)
    matching_work = next((candidate for candidate in works if hit['act_id'] == candidate['RecordNumber']), None)
    if matching_work is not None:
        result_dict['good_qid'] = matching_work['qid']
    # Every hit is written; good_qid is left blank when there was no match
    results_list.append(result_dict)

fieldnames = ['bad_qid', 'bad_class', 'good_qid', 'bad_label', 'image']
write_dicts_to_csv(results_list, 'wikidata_items_to_delete_act.csv', fieldnames)

print('done')


# Script to pull data for works needing processing

Run this after running the functions cell above.

In [None]:
# Items with an ACT ID that are not typed as artworks
hits = retrieve_gallery_classes()
print('Items found:', len(hits))
print()

filename = 'works_already_in_wikidata.csv'
works = read_dict(filename)

filename = 'act_all_202109241353_repaired.csv'
act_metadata = read_dict(filename)

filename = 'already_in_templated_data.csv'
commons_metadata = read_dict(filename)

act_database_list = []
commons_scrape_list = []

for hit in hits:

    # Look up hit in the ACT metadata to store in CSV
    act_record = next((record for record in act_metadata if hit['act_id'] == record['RecordNumber']), None)
    if act_record is None:
        print(hit['qid'], 'not found in ACT database. ', hit['label'])
    else:
        print(hit['qid'], hit['label'])
        act_database_list.append(act_record)

    # Look up the hit in the list of works already in Wikidata to get the filename.
    # The first matching row is used, and the filename is taken from that row
    # (not from whatever row the search loop happened to end on).
    matched_work = next((candidate for candidate in works if hit['act_id'] == candidate['RecordNumber']), None)
    if matched_work is None:
        print('could not find in list of works already in Wikidata')
    else:
        image_filename = matched_work['filename']
        print('image filename:', image_filename)

        # Look up the scraped Commons metadata for that image
        commons_item = next((item for item in commons_metadata if item['filename'] == image_filename), None)
        if commons_item is None:
            print('Not found in Commons database.')
        else:
            commons_scrape_list.append(commons_item)

    print()

# The column names come from the first matched row, so there is nothing to write when no row matched
if act_database_list:
    fieldnames = list(act_database_list[0].keys())
    write_dicts_to_csv(act_database_list, 'act_data_fix.csv', fieldnames)
else:
    print('No ACT records matched; act_data_fix.csv not written.')

if commons_scrape_list:
    fieldnames = list(commons_scrape_list[0].keys())
    write_dicts_to_csv(commons_scrape_list, 'commons_data_fix.csv', fieldnames)
else:
    print('No Commons records matched; commons_data_fix.csv not written.')

print('done')


# Hack of above script to process works in Commons and known to not be in Wikidata

In [None]:
# Do not need to look up ACT IDs in Wikidata because the works aren't there,
# so retrieve_gallery_classes() is not called in this cell.

# Commons works to generate metadata for are in this file
filename = '../create_items/add_to_wikidata.csv'
works_to_add = read_dict(filename)

filename = 'act_all_202109241353_repaired.csv'
act_metadata = read_dict(filename)

filename = 'already_in_templated_data.csv'
commons_metadata = read_dict(filename)

act_database_list = []
commons_scrape_list = []

for work_to_add in works_to_add:

    print('image filename:', work_to_add['filename'])

    # Look up the work in the ACT metadata to store in CSV
    act_record = next((record for record in act_metadata if work_to_add['act_id'] == record['RecordNumber']), None)
    if act_record is None:
        print(work_to_add['act_id'], 'not found in ACT database. ')
    else:
        act_database_list.append(act_record)

    # Look up the scraped Commons metadata for the image
    commons_item = next((item for item in commons_metadata if item['filename'] == work_to_add['filename']), None)
    if commons_item is None:
        print('Not found in Commons database.')
    else:
        commons_scrape_list.append(commons_item)

    print()

# The column names come from the first matched row, so there is nothing to write when no row matched
if act_database_list:
    fieldnames = list(act_database_list[0].keys())
    write_dicts_to_csv(act_database_list, 'act_data_fix.csv', fieldnames)
else:
    print('No ACT records matched; act_data_fix.csv not written.')

if commons_scrape_list:
    fieldnames = list(commons_scrape_list[0].keys())
    write_dicts_to_csv(commons_scrape_list, 'commons_data_fix.csv', fieldnames)
else:
    print('No Commons records matched; commons_data_fix.csv not written.')

print('done')
