In [1]:
# vandercommonsdatabot.ipynb, a Python script for collecting Wikimedia Commons pageview data
# version string of this script; it is embedded in the User-Agent header built by generate_header_dictionary()
version = '0.1'
created = '2021-12-14'

# (c) 2021 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# IMPORTANT NOTE: If you hack this script to download your own data, you MUST change the user_agent_header
# to your own URL and email address if you make modifications that affect how the script
# interacts with the API. In particular, DO NOT decrease the value of api_sleep below 0.01 .

# import, configuration, functions, etc.
# Use pip install PyGithub to install the github module

import json
import requests
from time import sleep
import csv
import io
import datetime
from pathlib import Path
import urllib.parse
from github import Github

# see Pageviews API information https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews
# see Wikimedia REST API information https://wikimedia.org/api/rest_v1/

# reference on PyGithub: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
# reference on GitHub API: https://developer.github.com/v3/guides/getting-started/

api_sleep = 0.015 # number of seconds to wait between calls to the API, rate limit 100 calls/s
accept_media_type = 'application/json' # value sent in the Accept header of Wikimedia API requests

# URL prefix of Commons file description pages
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'

github_username = ''  # set to empty string if using a token (for 2FA)
organization_name = 'heardlibrary' # GitHub account that owns the repo; used to build the raw-file and API URLs
organization_is_user = False # True: look the repo up under the authenticated user instead of an organization
repo_name = 'dashboard' # name of the GitHub repo that holds the data files
cred_directory = 'home' # set to 'home' if the credential is in the home directory, otherwise working directory
path_to_directory = 'gallery/' # path within the repo; file names are appended directly, so keep the trailing slash

# -----------------
# utility functions
# -----------------

# NOTE: change the user_agent_header string to something appropriate for your project
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers used for API calls.

    accept_media_type becomes the value of the Accept header. The User-Agent
    value is assembled from the bot name, the module-level version string,
    and the project URL and contact address.
    """
    contact_details = ' (https://github.com/HeardLibrary/dashboard/tree/master/gallery; mailto:steve.baskauf@vanderbilt.edu)'
    return {
        'Accept': accept_media_type,
        'User-Agent': 'VanderCommonsDataBot/' + version + contact_details,
    }

def generate_utc_date():
    """Return the current UTC date as an ISO 8601 string (form: 2019-12-05)."""
    # datetime.utcnow() is deprecated as of Python 3.12, so request an aware UTC time instead
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

def yesterday_utc():
    """Return yesterday's UTC date in two formats.

    Returns a tuple (yesterday_iso, yesterday_wikimedia) where the first item
    has the form 2019-12-05 and the second has the form 20191205 (the date
    format used in Wikimedia pageview API URLs).
    """
    # datetime.utcnow() is deprecated as of Python 3.12, so request an aware UTC time instead
    today = datetime.datetime.now(datetime.timezone.utc).date()
    yesterday = today - datetime.timedelta(days=1)
    return yesterday.strftime('%Y-%m-%d'), yesterday.strftime('%Y%m%d')
    
def filename_to_commons_page_article(filename):
    """Convert a Commons file name into the article string used in API URLs.

    Spaces become underscores and the name is percent-encoded, except that
    parentheses and commas (and slashes) are left unencoded. The result is
    prefixed with 'File:'.
    """
    underscored_name = filename.replace(' ', '_')
    # Listing the characters as safe leaves them unencoded in a single pass
    return 'File:' + urllib.parse.quote(underscored_name, safe='/(),')

# read from a CSV file into a list of dictionaries (representing a table)
def read_dicts_from_csv(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# read raw string from a file in GitHub
def read_string_from_github_file(organization_name, repo_name, path_to_directory, filename):
    """Return the raw text of a file on the master branch of a GitHub repo.

    path_to_directory is the path within the repo and must end with a slash
    because filename is appended to it directly.

    Raises a requests.exceptions.RequestException subclass if the request
    times out or the server answers with an error status, so that an error
    page is never mistaken for the content of the file.
    """
    path = path_to_directory + filename
    url = 'https://raw.githubusercontent.com/' + organization_name + '/' + repo_name + '/master/' + path
    # requests waits indefinitely unless a timeout is given
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text

# read from a CSV file in GitHub into a list of dictionaries (representing a table)
def read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, filename):
    """Read a CSV file on the master branch of a GitHub repo into a list of dictionaries.

    path_to_directory is the path within the repo and must end with a slash
    because filename is appended to it directly. The first row of the file
    supplies the dictionary keys.

    Raises a requests.exceptions.RequestException subclass if the request
    times out or the server answers with an error status. Without that check
    an error page would be parsed as a CSV with no data rows, and a caller
    that writes the table back would replace the real file with it.
    """
    path = path_to_directory + filename
    url = 'https://raw.githubusercontent.com/' + organization_name + '/' + repo_name + '/master/' + path
    # requests waits indefinitely unless a timeout is given
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    file_text = r.text.split('\n')
    return list(csv.DictReader(file_text))

# read from a CSV file in GitHub into a list of lists (representing a table)
def read_lists_from_github_csv(organization_name, repo_name, path_to_directory, filename):
    """Read a CSV file on the master branch of a GitHub repo into a list of lists.

    path_to_directory is the path within the repo and must end with a slash
    because filename is appended to it directly. The header row, if any, is
    returned as the first inner list.

    Raises a requests.exceptions.RequestException subclass if the request
    times out or the server answers with an error status, so that an error
    page is never parsed as if it were the CSV data.
    """
    path = path_to_directory + filename
    url = 'https://raw.githubusercontent.com/' + organization_name + '/' + repo_name + '/master/' + path
    # requests waits indefinitely unless a timeout is given
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    file_text = r.text.split('\n')
    # a file ending in a newline leaves one empty string at the end of the split; drop it
    # so that it does not become an empty row
    if file_text[-1] == '':
        file_text = file_text[:-1]
    return list(csv.reader(file_text))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of dictionaries as a UTF-8 CSV file.

    fieldnames gives the column order and is written as the header row;
    each dictionary in table becomes one data row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# write a list of dictionaries to a CSV-formatted string
def write_dicts_to_string(table, fieldnames):
    """Serialize a list of dictionaries as CSV text and return it as a string.

    fieldnames gives the column order and is written as the header row;
    each dictionary in table becomes one data row.
    """
    buffer = io.StringIO()
    dict_writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(table)
    return buffer.getvalue()

# write a list of lists to a CSV-formatted string
def write_lists_to_string(table):
    """Serialize a list of lists as CSV text and return it as a string.

    Each inner list becomes one row; no header row is added.
    """
    buffer = io.StringIO()
    csv.writer(buffer).writerows(table)
    return buffer.getvalue()

# -----------------
# functions for interacting with APIs
# -----------------

# This function sends a request to the Wikimedia REST API to get pageviews for an article
def get_pageview_counts(project, article, date):
    """Ask the Wikimedia REST API for the user pageviews of one article on one day.

    project is the subdomain (e.g. "commons.wikimedia.org")
    article is the page name after 'wiki/' in the URL (e.g. "File:imagex.jpg", or "Q2")
    date is in the format yyyymmdd

    Returns the view count as a string. When the API answers with a
    'Not found.' record (no views on that day, or the article doesn't exist)
    the string '0' is returned. For any other response the raw response text
    is returned, so callers should check that the value is numeric.

    Raises a requests.exceptions.RequestException subclass if the connection
    fails or times out.
    """
    query_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' + project + '/all-access/user/' + article + '/daily/' + date + '/' + date
    # requests waits indefinitely unless a timeout is given; a stalled connection would
    # otherwise hang the script instead of letting the caller retry
    r = requests.get(query_url, headers=generate_header_dictionary(accept_media_type), timeout=30)
    try:
        data = r.json()
        if 'items' in data:
            value = str(data['items'][0]['views'])
        elif data['title'] == 'Not found.':
            # Handle case where there were no views on that day or article didn't exist
            value = str(0)
        else:
            value = r.text
    except (ValueError, KeyError, IndexError, TypeError):
        # Error message not in JSON format, or JSON without the expected structure
        value = r.text
    # delay to avoid hitting the API too rapidly
    sleep(api_sleep)
    return value

# -----------------
# functions for interacting with GitHub
# -----------------

# value of directory should be either 'home' or 'working'
def load_credential(filename, directory):
    """Read a credential (token or password) from a text file and return its content.

    directory should be either 'home' or 'working'. With 'home' the file is
    looked for in the user's home directory; any other value is treated as
    'working' and filename is opened relative to the working directory.

    If the file cannot be opened, a message is printed and SystemExit is raised.
    """
    if directory == 'home':
        # Path.home() gets the path to the home directory; supposed to work for Win and Mac
        credential_path = Path.home() / filename
    else:
        directory = 'working'  # normalized so the message below names the place that was searched
        credential_path = Path(filename)
    try:
        with open(credential_path, 'rt', encoding='utf-8') as file_object:
            cred = file_object.read()
    except OSError:
        print(filename + ' file not found - is it in your ' + directory + ' directory?')
        # exit() is a helper of the site module meant for interactive sessions; raise SystemExit directly
        raise SystemExit
    return cred

# pass in an empty string for organization_name to use an individual account
# pass in an empty string for github_username to use a token instead of username login
def login_get_repo(repo_name, github_username, organization_name, organization_is_user, cred_directory):
    """Log in to GitHub through PyGithub and return the repository object for repo_name.

    If github_username is an empty string, the token stored in
    linked-data_github_token.txt is used to log in; otherwise the password
    stored in pwd.txt is used together with github_username. Both files are
    read with load_credential() from the location given by cred_directory.

    If organization_is_user is true, the repo is looked up under the
    logged-in user and organization_name is not used. Otherwise the repo is
    looked up under the organization named organization_name.
    """
    if github_username == '':
        client = Github(login_or_token=load_credential('linked-data_github_token.txt', cred_directory))
    else:
        client = Github(github_username, load_credential('pwd.txt', cred_directory))

    # the owner is either the logged-in user or an organization to which
    # the token creator has push access
    owner = client.get_user() if organization_is_user else client.get_organization(organization_name)
    return owner.get_repo(repo_name)

def get_user_list(repo):
    """Return the login names of the collaborators of a repository as a list."""
    return [collaborator.login for collaborator in repo.get_collaborators()]

def get_file_sha(account, repo, file_path):
    """Return the blob SHA of a file in a GitHub repo, or '' if the file doesn't exist.

    account is the name of the user or organization that owns the repo,
    repo is the repo name (a string), and file_path is the path of the file
    within the repo. The request is sent without authentication.

    Raises a requests.exceptions.RequestException subclass if the request
    times out or the API answers with an error other than 404, so that a
    failure such as rate limiting is not reported as a missing file.
    """
    # requests waits indefinitely unless a timeout is given
    r = requests.get('https://api.github.com/repos/' + account + '/' + repo + '/contents/' + file_path, timeout=30)
    if r.status_code == 404:
        # if the file doesn't already exist on GitHub, no sha will be returned
        return ''
    r.raise_for_status()
    file_data = r.json()
    try:
        return file_data['sha']
    except (KeyError, TypeError):
        # the response does not describe a single file (e.g. it is a list)
        return ''

# use this function to update an existing text file (the file is created if it doesn't exist yet)
def update_file(account, repo_name, path_to_directory, filename, content):
    """Write content to a file in the GitHub repo, creating the file if necessary.

    The file path is path_to_directory + filename. get_file_sha() is used to
    find out whether the file already exists; if it returns a SHA the file is
    updated, otherwise it is created. The response of the PyGithub call is
    returned.

    NOTE: the repository object is not a parameter; this function uses the
    module-level variable named repo, which must be set before calling.
    """
    path = path_to_directory + filename
    commit_message = 'Update ' + filename + ' file via API'
    sha = get_file_sha(account, repo_name, path)
    if sha != '':
        return repo.update_file(path, commit_message, content, sha)
    return repo.create_file(path, commit_message, content)

# -----------------
# top-level functions for acquiring the main dataset
# -----------------

# Retrieves yesterday's pageview counts for all of the Commons images listed in commons_images.csv
# If it fails due to timeout or some other error, the table remains unchanged
# Returns a raw CSV string
def get_commons_pageview_counts(organization_name, repo_name, path_to_directory, table, max_tries=12, retry_wait=300):
    """Add a row of yesterday's Commons pageview counts to table and return the table as CSV text.

    The list of images is read from commons_images.csv in the GitHub repo
    (columns commons_id and image_name are used). For each image the pageview
    count for yesterday (UTC) is requested from the Wikimedia REST API.

    table is a list of dictionaries holding the previously recorded rows. A
    new row (date, total, and one column per commons_id) is appended to it
    only if a numeric count was acquired for every image. If any image still
    fails after max_tries attempts, or the image list is empty, the table
    remains unchanged.

    max_tries is the number of attempts per image and retry_wait the number
    of seconds to wait after a failed attempt (defaults: 12 tries, 5 minutes
    apart, i.e. about an hour).

    Returns a raw CSV string of the table with columns date, total, and the
    commons_id values.
    """
    # Get Commons image data
    image_records = read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, 'commons_images.csv')

    # Create column headers list
    # M IDs are the Commons equivalents of Q IDs used by the Structured data Wikibase
    mid_list = [record['commons_id'] for record in image_records]
    fieldnames = ['date', 'total'] + mid_list

    # Retrieve data from the Wikimedia REST API
    project = 'commons.wikimedia.org'
    yesterday_iso, yesterday_wikimedia = yesterday_utc()
    #yesterday_iso = '2021-12-11' # uncomment to override date to be checked
    #yesterday_wikimedia = '20211211' # uncomment to override date to be checked

    row_dict = {'date': yesterday_iso}
    total = 0
    # an empty image list yields no row (and no undefined success flag)
    all_succeeded = bool(image_records)
    for record in image_records:
        image_filename = record['image_name']
        print(image_filename)
        article = filename_to_commons_page_article(image_filename)

        count = None
        for _ in range(max_tries):
            try:
                # int() fails if the API returned an error message instead of a number
                count = int(get_pageview_counts(project, article, yesterday_wikimedia))
                break
            except (requests.exceptions.RequestException, ValueError):
                sleep(retry_wait) # wait and try again

        if count is None:
            # the row is going to be discarded, so there is no point in querying the remaining images
            all_succeeded = False
            break

        # record the value only after it has been validated as a number
        row_dict[record['commons_id']] = str(count)
        total += count

    if all_succeeded:
        row_dict['total'] = str(total)
        table.append(row_dict)

    return write_dicts_to_string(table, fieldnames)



In [None]:
# Main polling loop: once per hour, check whether the UTC date is later than the date stored
# in last_run.txt on GitHub. If it is, collect yesterday's pageview counts, commit the updated
# CSV to GitHub, and record today's date in last_run.txt.
while True: # infinite loop; stop it with a keyboard/kernel interrupt
    try:
        # datetime.utcnow() is deprecated as of Python 3.12; dropping tzinfo keeps the printed
        # form unchanged (e.g. 2021-12-14T15:38:22.015730)
        print('Time checked:', datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat())

        date_last_run = read_string_from_github_file(organization_name, repo_name, path_to_directory, 'last_run.txt')
        print('Date last run:', date_last_run)

        date_now_utc = generate_utc_date()
        print('UTC date now is:', date_now_utc)

        # ISO 8601 dates sort chronologically when compared as strings
        if date_now_utc > date_last_run:
            # log into the GitHub API and create a repo instance
            # (update_file() reads this module-level variable)
            repo = login_get_repo(repo_name, github_username, organization_name, organization_is_user, cred_directory)

            # Record yesterday's pageview counts for the Commons images
            filename = 'commons_pageview_data.csv'
            table = read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, filename)
            rawCsvText = get_commons_pageview_counts(organization_name, repo_name, path_to_directory, table)
            response = update_file(organization_name, repo_name, path_to_directory, filename, rawCsvText)
            print(response)

            # Update the date last run
            response = update_file(organization_name, repo_name, path_to_directory, 'last_run.txt', generate_utc_date() )
            print('done')
        print()
        # wait an hour before checking again
        sleep(3600)
    except Exception as error:
        # Exception (not a bare except) so that KeyboardInterrupt and SystemExit still stop the loop
        print('Error occurred, trying again in 10 minutes')
        print(repr(error))
        sleep(600)

Time checked: 2021-12-14T15:38:22.015730
Date last run: 2021-12-13
UTC date now is: 2021-12-14
A Conversation with Guido di Brettinoro - Vanderbilt Fine Arts Gallery - 1984.021.tif
Adoration of the Sheperds - Vanderbilt Fine Arts Gallery - 1979.0264P.tif
Album painting of two men in a cottage in a mountain landscape - Vanderbilt Fine Arts Gallery - 1993.222.tif
Autumn River - Vanderbilt Fine Arts Gallery - 1979.0303P.tif
Caney Fork Cabin - Vanderbilt Fine Arts Gallery - 1973.005.tif
Capt. Sterrett in the Schr Enterprise paying tribute to Tripoli, August 1801 - Vanderbilt Fine Arts Gallery - 1979.0121.tif
Chinese Winter Landscape - Vanderbilt Fine Arts Gallery - 1978.014.tif
Christ Preaching - Vanderbilt Fine Arts Gallery - 1973.017.tif
Dante Discoursing with Cacciaguida (Paradisio, Canto 17) from Illustrations to Dante's Divine Comedy - Vanderbilt Fine Arts Gallery - 1984.022.tif
Death of Absalom - Vanderbilt Fine Arts Gallery - 1971.006.tif
Elegant Lovers - Vanderbilt Fine Arts Galler

Parvati on Nandi - Vanderbilt Fine Arts Gallery - 1971.068.tif
Portrait - Vanderbilt Fine Arts Gallery - 1975.018.tif
Portrait of a Gentleman in a Garden - Vanderbilt Fine Arts Gallery - 1990.004.tif
Portrait of a Gentleman in Profile - Vanderbilt Fine Arts Gallery - 1991.024.tif
Portrait of a Lady - Vanderbilt Fine Arts Gallery - 1979.0176.tif
Portrait of a Man - Vanderbilt Fine Arts Gallery - 1986.013.tif
Portrait of a Man in Armor - Vanderbilt Fine Arts Gallery - 1992.335.JPG
Portrait of a Man in Red - Vanderbilt Fine Arts Gallery - 1992.336.JPG
Portrait of a Young Girl - Vanderbilt Fine Arts Gallery - 1986.008.tif
Portrait of a Young Woman in Profile - Vanderbilt Fine Arts Gallery - 1991.023.tif
Portrait of Edward Bridgen - Vanderbilt Fine Arts Gallery - 1984.032.tif
Portrait of Joseph Montgomery Peters - Vanderbilt Fine Arts Gallery - 1971.001.tif
Portrait of Samuel Stephens Jr. - Vanderbilt Fine Arts Gallery - 1971.003.tif
Rural Landscape with Cows - Vanderbilt Fine Arts Gallery 

A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.016h.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.016i.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017a.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017b.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017c.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017d.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017e.tif
A page from an album of hand-colored woodblock prints depicting ancient sages - Vanderbilt Fine Arts Gallery - 1995.017f.tif
