In [None]:
# commons_data.ipynb, a Python script for downloading and scraping data from Wikimedia Commons
version = '0.1'
created = '2021-10-29'

# (c) 2021 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php
# NOTE: See the commonsbot code for more information about the various forms of identifiers used for Commons files.

# Common code
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)

read_api_sleep = 0.1 # delay to throttle the script and not hit the server too fast

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get local name from an IRI
def extract_localname(iri):
    """Return the text after the last '/' in iri (the whole string if it has no '/').

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    """
    # Splitting once from the right is enough because only the final piece is wanted
    return iri.rsplit('/', 1)[-1]

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its data rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize every row while the file is still open
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write the dictionaries in table to filename as UTF-8 CSV.

    fieldnames gives the column order and is written as the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)


In [None]:
# -------------------
# Retrieving image data from the Commons MediaWiki API
# -------------------

# This code is essentially useless because the data retrieved from the API is less extensive and no cleaner 
# than data scraped from the MediaWiki HTML. I've retained this for historical purposes.

# Initially, I thought that it was necessary to know the Wikibase entity ID to set the caption.
# So I wrote code to extract that from a query. However, one can use the page title, so that
# isn't actually necessary. But it might be needed anyway for the structured data part.

# The Wikibase entity ID can be used in lieu of the site+page name.
# The format is "M" plus the page ID. So page ID 41837276 has the entity ID M41837276
# Use action=query&prop=info&titles=File:Pluto-01_Stern_03_Pluto_Color_TXT.jpg and 
# extract the pageid field from result.

# Commons API examples: https://commons.wikimedia.org/wiki/Commons:API/MediaWiki
# sandbox: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles=File%3AMasonry_patterns_in_doorway_Tetouan_Morocco.jpg
# General Sandbox index: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles

def retrieve_commons_api_info(act_id, filename):
    """Fetch extended metadata for one Commons file from the MediaWiki API.

    Issues an action=query request with prop=imageinfo and iiprop=extmetadata
    for the page 'File:' + filename, then flattens selected extmetadata entries
    into a single dictionary.

    Parameters:
        act_id: identifier copied unchanged into the result under 'act_id'.
        filename: Commons file name without the 'File:' prefix.

    Returns:
        dict with these keys, in this order: act_id, filename, wikibase_id,
        categories_list, artist_name, artist_description, user_name, user_url,
        license_name, license_code, dateTime_original_string,
        main_subject_P921_label, main_subject_P921_full, main_subject_P921_qid,
        title, titles_list. categories_list and titles_list are JSON-encoded
        lists; the other extracted values default to ''.

    If the response carries no imageinfo/extmetadata for the page, every
    extracted field is left at its empty default instead of raising KeyError.
    Errors from the HTTP request, from JSON decoding, or from a response
    without query/pages still propagate to the caller.
    """
    api_url = 'https://commons.wikimedia.org/w/api.php'

    # Alternative queries, kept for reference only:
    #
    # get photos by a user
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'list': 'allimages',
    #     'aiuser': 'Baskaufs',
    #     'aisort': 'timestamp'
    # }
    #
    # get category information about a photo. NOTE: fewer items than categories provided in the extmetadata option
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    #     'prop': 'categories|imageinfo'
    # }
    #
    # get raw metadata embedded in a photo
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    #     'prop': 'imageinfo',
    #     'iiprop': 'metadata',
    #     'iimetadataversion': 'latest'
    # }
    #
    # get metadata for a photo including from file page (does not produce much except the page title, basically the filename)
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:' + filename,
    #     'prop': 'info'
    # }

    # get metadata for a photo including from file page (lots of data, but doesn't correspond exactly to
    # data on image web page)
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + filename,
        'prop': 'imageinfo',
        'iiprop': 'extmetadata'
    }

    response = requests.get(api_url, params=params)
    data = response.json()
    # print(json.dumps(data, indent=2))

    # Exceptions raised when an expected tag or attribute is absent from the parsed HTML:
    # AttributeError/TypeError when a tag lookup returned None, KeyError when an attribute is missing.
    missing_markup = (AttributeError, KeyError, TypeError)

    info = {'act_id': act_id, 'filename': filename} # dict to collect results
    pages = data['query']['pages']
    # The pages object has a single key identifying the page; assuming one page per title, take the first.
    # NOTE(review): the API keys this object by page ID, so this value presumably lacks the "M" prefix
    # of the Wikibase entity ID -- confirm before using it as an M-ID. Stored unchanged, as before.
    wikibase_id = next(iter(pages))
    info['wikibase_id'] = wikibase_id
    # imageinfo is a list; only its first item is used. Fall back to empty metadata when the page
    # has no imageinfo or no extmetadata so that the defaults below apply.
    imageinfo = pages[wikibase_id].get('imageinfo') or [{}]
    item_data = imageinfo[0].get('extmetadata') or {}
    # print(json.dumps(item_data, indent = 2))
    # print(item_data.keys())

    # Categories arrive as one pipe-delimited string
    if 'Categories' in item_data:
        categories_list = item_data['Categories']['value'].split('|')
    else:
        categories_list = []
    info['categories_list'] = json.dumps(categories_list)

    info['artist_name'] = ''
    info['artist_description'] = ''
    info['user_name'] = ''
    info['user_url'] = ''

    if 'Artist' in item_data:
        soup = BeautifulSoup(item_data['Artist']['value'], 'html.parser')
        # The tag name of the first node decides how the value is interpreted.
        # An empty value has no nodes at all, so guard against indexing an empty list.
        first_tag = getattr(soup.contents[0], 'name', None) if soup.contents else None
        if first_tag == 'bdi':
            # <bdi><a><span title="...">name</span></a></bdi>: name and description of the artist
            try:
                info['artist_name'] = soup.bdi.a.span.string
            except missing_markup:
                info['artist_name'] = str(soup)
            try:
                info['artist_description'] = soup.bdi.a.span['title']
            except missing_markup:
                info['artist_description'] = ''
        elif first_tag == 'a':
            # <a href="...">name</a>: treated as a link to a user
            try:
                info['user_name'] = soup.a.string
            except missing_markup:
                info['user_name'] = str(soup)
            try:
                info['user_url'] = soup.a['href']
            except missing_markup:
                info['user_url'] = ''
        else:
            # Anything else is stored as-is
            info['artist_name'] = str(soup)

    if 'LicenseShortName' in item_data:
        info['license_name'] = item_data['LicenseShortName']['value']
    else:
        info['license_name'] = ''
    if 'License' in item_data:
        info['license_code'] = item_data['License']['value']
    else:
        info['license_code'] = ''

    if 'DateTimeOriginal' in item_data:
        # Keep only the text that precedes any markup in the value
        info['dateTime_original_string'] = item_data['DateTimeOriginal']['value'].split('<')[0].strip()
    else:
        info['dateTime_original_string'] = ''

    if 'ImageDescription' in item_data:
        soup = BeautifulSoup(item_data['ImageDescription']['value'], 'html.parser')
        try:
            # Expected shape: <div><a href=".../Qnnn"><span title="...">label</span></a></div>
            info['main_subject_P921_label'] = soup.div.a.span.string
            info['main_subject_P921_full'] = soup.div.a.span['title']
            info['main_subject_P921_qid'] = extract_localname(soup.div.a['href'])
        except missing_markup:
            # Any other shape: keep the raw HTML as the label
            info['main_subject_P921_label'] = str(soup)
            info['main_subject_P921_full'] = ''
            info['main_subject_P921_qid'] = ''
    else:
        info['main_subject_P921_label'] = ''
        info['main_subject_P921_full'] = ''
        info['main_subject_P921_qid'] = ''

    if 'ObjectName' in item_data:
        soup = BeautifulSoup(item_data['ObjectName']['value'], 'html.parser')
        # use regex to match any text
        my_regex = re.compile(".*")
        title_strings = soup.find_all(text=my_regex)
        # remove newline items from list of title strings
        clean_title_list = [x.strip() for x in title_strings if x != '\n']
    else:
        clean_title_list = []
    # The first title string is the primary title; the complete list is kept as JSON
    info['title'] = clean_title_list[0] if clean_title_list else ''
    info['titles_list'] = json.dumps(clean_title_list)
    return info


In [None]:
# ---------------------------
# Scrape data from table on image page
# ---------------------------

server_sleep = 0.2 # delay (seconds) after each page request so the web server isn't hit too fast

# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork
# Historical photographs (e.g. museums) https://commons.wikimedia.org/wiki/Template:Photograph
# Art photo template adds to artwork template https://commons.wikimedia.org/wiki/Template:Art_Photo
# Credit line template provides attribution text required for CC BY licenses https://commons.wikimedia.org/wiki/Template:Credit_line

file_path = '../../vandycite/act/processed_lists/works_already_in_wikidata.csv'
file_data = read_dict(file_path)

# List of field labels to look for. Labels found on the page are lower-cased before comparison,
# so only lower case entries in this file can match.
with open('all_fields.json', 'rt', encoding='utf-8') as file_object:
    all_field_list = json.loads(file_object.read())
    field_list = all_field_list # stop using separate lists

# Create a list of fields to be used for the CSV output
output_fields = ['filename', 'template_type'] + list(all_field_list)

# regex that matches any text; compiled once because it is used for every table row
any_text_regex = re.compile(".*")

nonstandard_fields = []
output_list = []
#if True:
for record in file_data:
    print(record['filename'])
    image_filename = record['filename']
    #image_filename = 'Fra_Angelico_-_The_Coronation_of_the_Virgin_-_WGA0630.jpg'
    # image_filename = 'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'
    # image_filename = 'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'
    # image_filename = "Christ's_temptation_(Monreale).jpg"

    # Create a dictionary with keys for all fields and empty list values (written to the CSV as [])
    output_dict = {'filename': image_filename}
    for field in all_field_list:
        output_dict[field] = []

    # Retrieve the page HTML
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    response = requests.get(page_url)

    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', class_=re.compile('fileinfotpl'))
    if len(tables) == 0:
        output_dict['template_type'] = 'none'
        print('No data table')
        print()
        output_list.append(output_dict)
        sleep(server_sleep) # throttle on this path too; a request was still made
        continue
    try:
        # The template type is the part of the table's first class name after the last hyphen
        template_type = str(tables[0]['class'][0]).split('-')[-1]
    except (KeyError, IndexError, TypeError):
        template_type = 'unknown'
    print('Page template:', template_type) # useful types are "information" and "artwork"
    print()
    output_dict['template_type'] = template_type

    # stop using separate lists
    #with open(template_type + '.json', 'rt') as file_object:
    #    field_list = json.loads(file_object.read())
    #print(field_list)

    # Step through all of the rows looking for columns with particular labels.
    # Use the tbody when present, otherwise the table itself, so a table without tbody doesn't raise.
    table_body = tables[0].tbody or tables[0]
    rows = table_body.find_all('tr', recursive=False) # find only in direct child tr elements
    for row in rows:
        columns = row.find_all('td')
        # A usable row needs a label cell and a value cell; skip anything shorter
        if len(columns) < 2:
            continue

        # In cases where the contents of the td tag are more complex than a simple string (e.g. if the
        # contents include text and markup tags), the strings attribute generates a tuple-like iterable
        # of strings included inside the tag (a "generator"). The first iterable string is always the one we want
        label_strings = list(columns[0].strings)
        # A label cell without text gets an empty name and is reported as non-standard,
        # rather than reusing the name left over from a previous row
        field_name = ''
        matched_field = None
        if len(label_strings) > 0:
            field_name = label_strings[0]
            lowered_name = field_name.lower()
            if lowered_name in field_list:
                matched_field = lowered_name

        if columns[1].string:
            # Turn the string into a list of one string
            value_string = json.dumps([columns[1].string])
        else:
            # Collect every text node, strip surrounding whitespace, and drop the ones that end up empty
            text_nodes = columns[1].find_all(text=any_text_regex)
            stripped_strings = [x.strip() for x in text_nodes]
            clean_string_list = [x for x in stripped_strings if x != '']
            value_string = json.dumps(clean_string_list)

        if matched_field is not None:
            #print('Matched standard field:', field_name)
            #print(value_string)
            # Insert the found value into the dict for the matched field
            output_dict[matched_field] = value_string
        else:
            #print('Non-standard field:', field_name)
            #print(value_string)
            nonstandard_fields.append({'filename': image_filename, 'fieldname': field_name, 'value': value_string})
        #print()
    sleep(server_sleep) # Don't hit the web server too fast
    #print(json.dumps(output_dict, indent = 2))
    output_list.append(output_dict)
    #print('-----------------')

#print(json.dumps(nonstandard_fields, indent = 2))
#write_dicts_to_csv(nonstandard_fields, 'nonstandard_fields.csv', ['filename', 'fieldname', 'value'])
write_dicts_to_csv(output_list, 'already_in_templated_data.csv', output_fields)

print('done')

In [None]:
# ---------------------------
# Look for a Wikidata link on the image page
# ---------------------------

# This script finds the link for the tiny little Wikidata logo found on many pages that use the artwork template. 
# It's significant because this links to the abstract artwork even if the file represented on the Commons page
# isn't the one used as the value of the image (P18) property in Wikidata.

# For references on art in Wikidata, see https://www.wikidata.org/wiki/Wikidata:WikiProject_sum_of_all_paintings
# https://www.wikidata.org/wiki/Wikidata:WikiProject_Visual_arts/Item_structure

file_path = '../../vandycite/act/create_items/abstract_artworks_out.csv'
file_data = read_dict(file_path)

output_list = []
#if True:
for record in file_data:
    image_filename = record['image']
    print(image_filename)
    # image_filename = 'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'
    # image_filename = 'Ingobertus_001.jpg' # should not work as of 2022-01-13
    # image_filename = 'Fra_Filippo_Lippi_-_Madonna_and_Child_with_two_Angels_-_Uffizi.jpg'
    # image_filename = 'Drawing of Abbie Sweetwine treating injured.jpg' # should produce nothing

    # Fetch the HTML of the file page and parse it
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only artwork-template tables are searched; pages without one yield no rows
    artwork_tables = soup.findAll('table', class_ = re.compile('fileinfotpl-type-artwork'))
    table_rows = artwork_tables[0].findAll('tr') if len(artwork_tables) > 0 else []

    # The link to the Wikidata item will be in an href in the first table row only.
    # Sometimes the artist link is to a Wikidata item, so can't screen on subdomain;
    # screen on the anchor's title attribute instead.
    wikidata_anchors = table_rows[0].findAll('a', title = re.compile('wikidata:')) if len(table_rows) > 0 else []

    if len(wikidata_anchors) > 0:
        qid = extract_localname(wikidata_anchors[0]['href'])
        print(qid)
        output_list.append({'act_id': record['act'], 'qid': qid, 'filename': record['image']})
    sleep(read_api_sleep) # Don't hit the server too fast

fieldnames = ['act_id', 'qid', 'filename']
write_dicts_to_csv(output_list, '../../vandycite/act/create_items/wikidata_found.csv', fieldnames)

print('done')
