In [None]:
# commonsbot.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.

# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Global variables
# ----------------

script_version = '0.5.1'
version_modified = '2022-08-24'
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'

# -----------------------------------------
# Version 0.4 change notes: 
# - Removed double spaces from labels before they are used to generate image filenames.
# - Skip over images with raw filenames that contain spaces and log an error for them to be manually removed.
# -----------------------------------------
# Version 0.5.1 change notes:
# - enable writing of multiple Structured Data in Commons claims in single API call
# - support both Artwork (for 2D) and Art Photo (for 3D) templates in the Wikitext
# - use appropriate SDC licenses for 3D works
# - clean up code and convert login to an object
# - remove hard-coded values and replace with YAML configuration file
# - improve control of throttling between media file uploads to the Commons API
# -----------------------------------------

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php

# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# Bot flag is an indication of community trust and prevents new images/recent changes lists from getting swamped.
# It also confirms that the bot's edits are not likely to need manual checking.

# ----------------
# Module imports
# ----------------

import json
import yaml
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex. Function to check for the particular form of xsd:dateTime required for full dates in Wikidata
from datetime import datetime
import os
import pandas as pd
import urllib.parse
import webbrowser

# AWS Python SDK
import boto3
import botocore


# ------------------------
# Utility functions
# ------------------------

def read_dict(filename):
    """Load a CSV file as a list of row dictionaries.

    The first row of the file supplies the keys; every following row
    becomes one dictionary. The file is decoded as UTF-8.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows while the file is still open.
        return list(csv.DictReader(csv_file))

def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of row dictionaries as a UTF-8 CSV file.

    A header row built from ``fieldnames`` is written first, followed by
    one line per dictionary in ``table``. ``fieldnames`` also fixes the
    column order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)
            

# ------------------------
# Commons identifier/URL conversion functions
# ------------------------

# There are four identifiers used in Commons:

# The most basic one is the filename, unencoded and with file extension.

# The Commons web page URL is formed from the filename by prepending a subpath and "File:", replacing spaces in the filename with _, and URL-encoding the file name string
# The reverse process may be lossy because it assumes that underscores should be turned into spaces, and the filename might actually contain underscores.

# The Wikidata IRI identifier for the image is formed from the filename by URL-encoding it and prepending a subpath and "Special:FilePath/"
# The reverse process is lossless since it simply URL-decodes the local name part of the IRI.

# Each media page is also identified by an M ID, which is the Commons equivalent of a Q ID. Since structured
# data on Commons is based on a Wikibase instance, the M ID is used when writing structured data to the API.

def commons_url_to_filename(url):
    """Recover the unencoded file name from a Wikidata IRI identifier.

    Note
    ----
    The expected form of the URL is: http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg
    """
    # Everything after the Special:FilePath/ prefix is the URL-encoded file name.
    pieces = url.split(commons_prefix)
    encoded_local_name = pieces[1]
    return urllib.parse.unquote(encoded_local_name)

def filename_to_commons_url(filename):
    """Build the Wikidata IRI identifier for a raw file name.

    The file name is URL-encoded and appended to the Special:FilePath/ prefix.
    """
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Recover the raw file name from a Commons web page URL.

    Note
    ----
    The expected form of the URL is: https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg
    """
    # Everything after the File: prefix is the encoded page title.
    encoded_title = url.split(commons_page_prefix)[1]
    # Underscores are turned into spaces before percent-decoding.
    spaced_title = encoded_title.replace('_', ' ')
    return urllib.parse.unquote(spaced_title)

def filename_to_commons_page_url(filename):
    """Build the Commons web page URL for a raw file name.

    Spaces become underscores, the result is URL-encoded and appended to
    the File: page prefix, and the escapes for parentheses and commas are
    then turned back into the literal characters.
    """
    underscored = filename.replace(' ', '_')
    page_url = commons_page_prefix + urllib.parse.quote(underscored)
    # Undo the percent-encoding of "(", ")" and ",".
    for escaped, literal in (('%28', '('), ('%29', ')'), ('%2C', ',')):
        page_url = page_url.replace(escaped, literal)
    return page_url

def get_commons_image_pageid(image_filename, timeout=30):
    """Look up the Commons image page ID ("M ID") using the image file name.

    Parameters
    ----------
    image_filename : str
        File name of the media file; "File:" is prepended to form the page title.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        connection raises an exception instead of blocking the script forever.

    Returns
    -------
    str
        The page ID, taken from the keys of the ``pages`` object in the JSON
        response. NOTE: the API appears to return '-1' when it can't find
        the page.

    Note
    ----
    The wbeditentity_upload function (which writes to a Wikibase API) needs the M ID,
    the structured data on Commons equivalent of a Q ID.
    """
    # Ask only for the basic page info of the file page.
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }

    response = requests.get(
        'https://commons.wikimedia.org/w/api.php',
        params=params,
        timeout=timeout
    )
    data = response.json()
    #print(json.dumps(data, indent=2))

    # The value of 'pages' is a dict that has the page IDs as keys. Info on only
    # one page was requested, so the first key is the page ID being sought.
    page_dict = data['query']['pages']
    page_id = next(iter(page_dict))
    #print('Page ID:',page_id)

    # No sleep time is added for API reads, which are less resource-intensive
    # than write operations. Also, only single requests are being made between
    # operations that are time-consuming.
    return page_id
    

# ---------------------------
# Body of main script
# ---------------------------

# This section contains configuration information and performs necessary logins
# No writing is done, so it's "safe" to run any time

# This section needs to be run prior to running any code that interacts with the Commons API
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used

print('Loading data')

# Load configuration values. The encoding is given explicitly (as in the CSV
# helper functions) so that reading the file does not depend on the platform's
# default encoding.
with open('commonsbot_config.yml', 'r', encoding='utf-8') as file:
    config_values = yaml.safe_load(file)

if config_values['working_directory_path'] != '':
    # Change working directory to image upload directory
    os.chdir(config_values['working_directory_path'])

# These files are all relative to the current working directory.
# Every table is read with all columns as strings and with empty cells kept as
# empty strings (na_filter=False) rather than converted to NaN.

# Note: setting the index to be the Q ID requires that qid has a unique value for each row. This should be the case.
works_metadata = pd.read_csv('../works_multiprop.csv', na_filter=False, dtype = str)
works_metadata.set_index('qid', inplace=True)

# Source metadata, indexed by accession number.
raw_metadata = pd.read_csv('../gallery_works_renamed1.csv', na_filter=False, dtype = str)
raw_metadata.set_index('accession_number', inplace=True)

# Classification of works, indexed by Q ID.
works_classification = pd.read_csv('../../gallery_buchanan/works_classification.csv', na_filter=False, dtype = str)
works_classification.set_index('qid', inplace=True)

# Status table, indexed by Q ID.
works_ip_status = pd.read_csv('../items_status_abbrev.csv', na_filter=False, dtype = str)
works_ip_status.set_index('qid', inplace=True)

# The following string literal holds disabled code fragments; it is a bare
# expression and has no effect when the script runs.
'''
existing_images = pd.read_csv('commons_images.csv', na_filter=False, dtype = str) # Don't make the Q IDs the index!
ip_status = works_ip_status.loc[index, 'status']
if image_dimension_series['height'] * image_dimension_series['width'] < config_values['minimum_pixel_squared']:
    print('Image too small.')
image_metadata['creator_string'] = raw_metadata.loc[image_metadata['inventory_number']]['creator_string']
'''

# List of all inventory numbers in the works metadata, used for membership checks.
inventory_numbers = works_metadata['inventory_number'].tolist()
print(inventory_numbers)

image_dimensions = pd.read_csv('image_dimensions.csv', na_filter=False, dtype = str)
# Convert some columns to integers
image_dimensions[['kilobytes', 'height', 'width']] = image_dimensions[['kilobytes', 'height', 'width']].astype(int)

#image = image_dimensions.head(500)
images = image_dimensions

# Pattern matching strings that consist only of digits.
regex = '^[0-9]+$'



In [None]:
# Report images whose accession suffix is not purely numeric, does not end in
# 'P', and whose constructed inventory number is missing from inventory_numbers.

# Hoist loop-invariant work: compile the pattern once and use a set so that each
# membership test is O(1) instead of scanning the whole list for every image.
numeric_pattern = re.compile(regex)
known_inventory_numbers = set(inventory_numbers)

for index, image in images.iterrows():
    # Drop the first dot-separated piece of the accession and keep the rest.
    acc_index = '.'.join(image['accession'].split('.')[1:])
    #print(acc_index)
    numeric = bool(numeric_pattern.search(acc_index))
    if numeric:
        continue
    # endswith() is used instead of acc_index[-1] so that an accession without
    # any dot (empty remainder) does not raise an IndexError.
    if acc_index.endswith('P'):
        continue
    constructed_inven = str(image['subdir']) + '.' + acc_index
    if constructed_inven not in known_inventory_numbers:
        print(constructed_inven)
    #print()

print('done')