In [13]:
# import, configuration, functions, etc.

import json
import requests
from time import sleep
import csv
import io
import datetime
from pathlib import Path
from github import Github

# the access token should be generated for read/write access to public repos
# see https://developer.github.com/v3/auth/#working-with-two-factor-authentication
# see https://github.com/settings/tokens/new
# select public_repo

# reference on PyGithub: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
# reference on GitHub API: https://developer.github.com/v3/guides/getting-started/

# Pause (in seconds) taken after every request made by the SPARQL and XTools query functions.
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
# Value sent in the Accept header of API requests.
accept_media_type = 'application/json'

# GitHub settings used by the login, read, and update helpers below.
github_username = ''  # set to empty string if using a token (for 2FA)
organization_name = 'heardlibrary'
organization_is_user = False # True: get the repo from the authenticated user instead of the organization
repo_name = 'linked-data'
cred_directory = 'home' # set to 'home' if the credential is in the home directory, otherwise working directory
path_to_directory = 'publications/data/' # prefix prepended to every data filename in the repo

# -----------------
# utility functions
# -----------------

# NOTE: change the user_agent_header string to something appropriate for your project
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers sent with every API call.

    accept_media_type: value to place in the Accept header.
    Returns a dict holding the Accept and User-Agent headers.
    """
    headers = {'Accept': accept_media_type}
    # Identifies this bot (project URL and contact address) to the service operators.
    headers['User-Agent'] = 'VanderDataBot/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/data; mailto:steve.baskauf@vanderbilt.edu)'
    return headers

def generate_utc_date():
    """Return the current UTC date as an ISO 8601 string, e.g. '2019-12-05'.

    A timezone-aware clock reading is used because datetime.utcnow() is
    deprecated as of Python 3.12.
    """
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    """Return the fifth '/'-separated piece of iri.

    For an IRI of the form http://www.wikidata.org/entity/Q6386232 the pieces
    are 'http:', '', 'www.wikidata.org', 'entity', 'Q6386232', so the Q number
    sits at index 4. Raises IndexError when the IRI has fewer pieces.
    """
    return iri.split('/')[4]

# read from a CSV file into a list of dictionaries (representing a table)
def read_dicts_from_csv(filename):
    """Load a UTF-8 CSV file as a list of row dictionaries keyed by the header row.

    filename: path of the CSV file to read.
    Returns one dict per data row (an empty list if there are no data rows).
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# read raw string from a file in GitHub
def read_string_from_github_file(organization_name, repo_name, path_to_directory, filename):
    """Fetch a file from the master branch of a GitHub repo and return its text.

    The file is requested from raw.githubusercontent.com. The response body is
    returned as-is; the HTTP status is not checked, so an error page's text is
    returned for a missing file.
    """
    raw_url = f'https://raw.githubusercontent.com/{organization_name}/{repo_name}/master/{path_to_directory}{filename}'
    return requests.get(raw_url).text

# read from a CSV file in GitHub into a list of dictionaries (representing a table)
def read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, filename):
    """Download a CSV file from GitHub and return it as a list of row dictionaries.

    The file is fetched with read_string_from_github_file() and parsed from an
    in-memory text stream. Feeding the csv module a stream opened with
    newline='' (rather than text pre-split on '\n') keeps newlines embedded in
    quoted fields intact and copes with any line-ending convention.

    Returns one dict per data row, keyed by the header row; blank lines are
    skipped by csv.DictReader.
    """
    file_text = read_string_from_github_file(organization_name, repo_name, path_to_directory, filename)
    return list(csv.DictReader(io.StringIO(file_text, newline='')))

# read from a CSV file in GitHub into a list of lists (representing a table)
def read_lists_from_github_csv(organization_name, repo_name, path_to_directory, filename):
    """Download a CSV file from GitHub and return it as a list of row lists.

    The file is fetched with read_string_from_github_file() and parsed from an
    in-memory text stream. Feeding the csv module a stream opened with
    newline='' (rather than text pre-split on '\n') keeps newlines embedded in
    quoted fields intact and copes with any line-ending convention. A final
    line terminator does not produce a spurious empty trailing row.

    Returns a list of lists; the first item is the header row when present.
    """
    file_text = read_string_from_github_file(organization_name, repo_name, path_to_directory, filename)
    return list(csv.reader(io.StringIO(file_text, newline='')))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of row dictionaries as a UTF-8 CSV file with a header row.

    table: iterable of dicts, one per row.
    filename: path of the file to create or overwrite.
    fieldnames: column names, in output order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# write a list of dictionaries to a CSV-formatted string
def write_dicts_to_string(table, fieldnames):
    """Serialize a list of row dictionaries to CSV text (header row first).

    table: iterable of dicts, one per row.
    fieldnames: column names, in output order.
    Returns the CSV document as a single string.
    """
    buffer = io.StringIO()
    dict_writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(table)
    return buffer.getvalue()

# write a list of lists to a CSV-formatted string
def write_lists_to_string(table):
    """Serialize a list of row lists to CSV text.

    table: iterable of rows, each an iterable of cell values.
    Returns the CSV document as a single string.
    """
    buffer = io.StringIO()
    csv.writer(buffer).writerows(table)
    return buffer.getvalue()

# -----------------
# functions for interacting with APIs
# -----------------

# This function sends a query to a SPARQL endpoint and returns a single value.
# For the Wikidata SPARQL endpoint, it extracts "single_value" from the query.
def get_single_value(query, endpoint_url):
    """Run a SPARQL query that binds ?single_value and return that value.

    query: SPARQL query text whose first result row binds "single_value".
    endpoint_url: URL of the SPARQL endpoint to query.

    Returns the bound value (a string) on success. If the response is not JSON
    or lacks the expected structure, returns a one-item list holding the raw
    response text, so callers must check the type of the result.
    Exceptions raised by the HTTP request itself are not caught here.
    """
    response = requests.get(endpoint_url, params={'query': query}, headers=generate_header_dictionary(accept_media_type))
    try:
        # The SPARQL JSON results layout is standardized, so the same extraction
        # is applied to every endpoint. Restricting it to the Wikidata URL left
        # the return value unassigned for any other endpoint.
        value = response.json()['results']['bindings'][0]['single_value']['value']
    except (ValueError, KeyError, IndexError, TypeError):
        # not JSON, or JSON without the expected bindings: hand back the raw body
        value = [response.text]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return value

# This function sends a query to the Wikidata SPARQL endpoint that searches for
# counts related to all subsidiary units of Vanderbilt. The function returns a list of
# dictionaries with the Q ID and count for each unit.
def get_unit_counts(query):
    """Run a per-unit count query against the Wikidata SPARQL endpoint.

    query: SPARQL query text whose result rows bind ?unit (an entity IRI) and
        ?count.

    Returns a list of {'unit': <Q ID>, 'count': <count string>} dicts on
    success. If the response is not JSON or lacks the expected structure,
    returns a one-item list holding the raw response text, so callers must
    check the items of the result.
    Exceptions raised by the HTTP request itself are not caught here.
    """
    endpoint_url = 'https://query.wikidata.org/sparql'
    response = requests.get(endpoint_url, params={'query': query}, headers=generate_header_dictionary('application/json'))
    try:
        table = [
            {'unit': extract_qnumber(binding['unit']['value']), 'count': binding['count']['value']}
            for binding in response.json()['results']['bindings']
        ]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Only parsing/shape errors are handled; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        table = [response.text]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return table

# This function sends a query to the XTools Edit Counter and returns a single value.
def get_xtools_edit_counts(username, project, namespace):
    """Query the XTools simple edit counter for one user and return the live edit count.

    username: account name to look up.
    project: project identifier placed in the request URL (e.g. 'wikidata').
    namespace: namespace identifier placed in the request URL, as a string.

    Returns the value of 'live_edit_count' from the JSON response. If the
    response is not JSON or lacks that key, returns a one-item list holding the
    raw response text, so callers must check the type of the result.
    Exceptions raised by the HTTP request itself are not caught here.
    """
    query_url = 'https://xtools.wmflabs.org/api/user/simple_editcount/' + project + '/' + username + '/' + namespace
    response = requests.get(query_url, headers=generate_header_dictionary(accept_media_type))
    try:
        value = response.json()['live_edit_count']
    except (ValueError, KeyError, TypeError):
        # Only parsing/shape errors are handled; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        value = [response.text]
    # delay to avoid hitting the API too rapidly
    sleep(sparql_sleep)
    return value

# -----------------
# functions for interacting with GitHub
# -----------------

# value of directory should be either 'home' or 'working'
def load_credential(filename, directory):
    """Read a credential (token or password) from a plain-text file.

    filename: name of the credential file.
    directory: 'home' to look in the user's home directory; any other value
        means the path is resolved relative to the working directory.

    Returns the file's content with surrounding whitespace removed, so that a
    trailing newline left by a text editor does not become part of the
    credential.
    Raises SystemExit (after printing a hint) when the file cannot be opened.
    """
    if directory == 'home':
        credential_path = Path.home() / filename
    else:
        directory = 'working'  # only used in the error message below
        credential_path = filename
    try:
        with open(credential_path, 'rt', encoding='utf-8') as file_object:
            cred = file_object.read().strip()
    except OSError:
        print(filename + ' file not found - is it in your ' + directory + ' directory?')
        # Raise directly instead of calling exit(), which is an interactive
        # helper and is not guaranteed to stop execution in every environment.
        raise SystemExit(1)
    return cred

# set organization_is_user to True to use the authenticated user's own repo (organization_name is then ignored)
# pass in an empty string for github_username to use a token instead of username login
def login_get_repo(repo_name, github_username, organization_name, organization_is_user, cred_directory):
    """Authenticate with GitHub and return a PyGithub repository object.

    repo_name: name of the repository to open.
    github_username: empty string to authenticate with the token stored in
        'linked-data_github_token.txt'; otherwise the login name, with the
        password read from 'pwd.txt'.
    organization_name: organization that owns the repo; not used when
        organization_is_user is true.
    organization_is_user: when true, the repo is looked up under the
        authenticated user rather than under an organization.
    cred_directory: passed to load_credential() to locate the credential file.
    """
    using_token = github_username == ''
    if using_token:
        g = Github(login_or_token=load_credential('linked-data_github_token.txt', cred_directory))
    else:
        g = Github(github_username, load_credential('pwd.txt', cred_directory))

    # The repo owner is either the authenticated user or the named organization.
    owner = g.get_user() if organization_is_user else g.get_organization(organization_name)
    return owner.get_repo(repo_name)

def get_user_list(repo):
    """Return the login names of everyone yielded by repo.get_collaborators().

    repo: repository object exposing get_collaborators(); each item it yields
        must have a .login attribute.
    """
    return [collaborator.login for collaborator in repo.get_collaborators()]

def get_file_sha(account, repo, file_path):
    """Look up the blob SHA of a file through the GitHub contents API.

    account: user or organization that owns the repository.
    repo: repository name (a string, not a repository object).
    file_path: path of the file within the repository.

    Returns the value of 'sha' from the JSON response, or an empty string when
    the response carries no 'sha' (e.g. the file doesn't already exist on
    GitHub). The request is sent without authentication.
    """
    response = requests.get('https://api.github.com/repos/' + account + '/' + repo + '/contents/' + file_path)
    file_data = response.json()
    try:
        return file_data['sha']
    except (KeyError, TypeError):
        # KeyError: JSON object without 'sha'; TypeError: response was not a
        # JSON object. A bare except would also swallow KeyboardInterrupt and
        # SystemExit.
        return ''

# use this function to update an existing text file
def update_file(account, repo_name, path_to_directory, filename, content):
    """Create or update a text file in the GitHub repository.

    account, repo_name: owner and repository name, used to look up the file's
        current blob SHA.
    path_to_directory, filename: concatenated to form the path within the repo.
    content: new text content of the file.

    NOTE: the write goes through the module-level ``repo`` object, which is not
    a parameter and must have been assigned before this function is called.

    Returns the response of repo.update_file() when the file already has a SHA,
    otherwise the response of repo.create_file().
    """
    path = f'{path_to_directory}{filename}'
    commit_message = f'Update {filename} file via API'
    sha = get_file_sha(account, repo_name, path)
    if sha != '':
        return repo.update_file(path, commit_message, content, sha)
    # no SHA was found, so the file is created instead of updated
    return repo.create_file(path, commit_message, content)

# -----------------
# top-level functions for acquiring the main datasets
# -----------------

# Retrieves the total contributions for all of the participants in the VandyCite project
# If it fails due to timeout or some other error, the table remains unchanged
# Returns a raw CSV string
def get_vandycite_contribution_counts(organization_name, repo_name, path_to_directory, table):
    """Append today's edit counts for every listed user to table and return CSV text.

    The usernames are read from 'vandycite_users.csv' in the GitHub repo. For
    each one the XTools edit counter is queried, retrying every 5 minutes for
    up to 12 attempts. A reply that cannot be converted to an integer (such as
    the error payload returned by get_xtools_edit_counts) counts as a failed
    attempt.

    organization_name, repo_name, path_to_directory: location of the users file.
    table: list of row dicts from previous runs; a new row is appended in place
        only when a count was obtained for every user, otherwise it is left
        unchanged.

    Returns the table serialized as CSV with the columns 'date', one column
    per username, and 'total'.
    """
    # Get username list
    user_dicts = read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, 'vandycite_users.csv')
    vandycite_user_list = [user_dict['username'] for user_dict in user_dicts]

    # Settings for the XTools Edit Counter API
    project = 'wikidata'
    namespace = '0' # 0 is the main namespace

    fieldnames = ['date'] + vandycite_user_list
    # The row carries a 'total' value, so it needs a column; without one the
    # CSV writer rejects the row.
    if 'total' not in fieldnames:
        fieldnames.append('total')
    row_dict = {'date': generate_utc_date()}

    total = 0
    # With no users there is nothing to record, so no row is added.
    all_succeeded = bool(vandycite_user_list)
    for username in vandycite_user_list:
        print(username)
        obtained = False
        # try to acquire the data for an hour
        for _ in range(12):
            try:
                count = get_xtools_edit_counts(username, project, namespace)
                edit_count = int(count)  # rejects the error payload (a list)
            except Exception:
                # retry boundary: any request or parsing failure is retried
                sleep(300) # wait 5 minutes and try again
            else:
                row_dict[username] = count
                total += edit_count
                obtained = True
                break
        if not obtained:
            # One missing user makes the whole row unusable; stop early.
            all_succeeded = False
            break
    row_dict['total'] = str(total)

    if all_succeeded:
        table.append(row_dict)

    return write_dicts_to_string(table, fieldnames)

# Runs all of the queries that retrieve a single value for the whole university
# If it fails due to timeout or some other error, the table remains unchanged
# Returns a raw CSV string
def get_vu_counts(table):
    """Append today's university-wide counts to table and return CSV text.

    Each query below yields a single count from the Wikidata Query Service.
    Every query is retried every 5 minutes for up to 12 attempts. A reply that
    cannot be converted to an integer (such as the error payload returned by
    get_single_value) counts as a failed attempt.

    table: list of row dicts from previous runs; a new row is appended in place
        only when every query produced a count, otherwise it is left unchanged.

    Returns the table serialized as CSV with the columns 'date' followed by
    one column per query name.
    """
    # NOTE(review): judging by the variable names, the queries count people,
    # men, women, ORCID holders and works linked to units under wd:Q29052 --
    # confirm the property meanings against Wikidata.
    all_vu_query_list = [
        {'name': 'vu_total',
        'query': '''
        select (count(distinct ?person) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          }
        '''},
        {'name': 'vu_men',
        'query': '''
        select (count(distinct ?man) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          }
        '''},
        {'name': 'vu_women',
        'query': '''
        select (count(distinct ?woman) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          }
        '''},
        {'name': 'vu_orcid',
        'query': '''
        select (count(distinct ?person) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?person wdt:P496 ?orcid.
          }
        '''},
        {'name': 'vu_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?work wdt:P50 ?person.
          }
        '''},
        {'name': 'vu_men_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          ?work wdt:P50 ?man.
          }
        '''},
        {'name': 'vu_women_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          ?work wdt:P50 ?woman.
          }
        '''},
    ]

    # Retrieve data from Wikidata Query Service
    endpoint_url = 'https://query.wikidata.org/sparql'

    # The full column list is built up front so the CSV header stays complete
    # even when the queries are abandoned part-way through.
    fieldnames = ['date'] + [query_dict['name'] for query_dict in all_vu_query_list]
    row_dict = {'date': generate_utc_date()}

    all_succeeded = True
    for query_dict in all_vu_query_list:
        query_name = query_dict['name']
        print(query_name)
        obtained = False

        # try to acquire the data for an hour
        for _ in range(12):
            try:
                count = get_single_value(query_dict['query'], endpoint_url)
                int(count)  # rejects the error payload (a list) and non-numeric text
            except Exception:
                # retry boundary: any request or parsing failure is retried
                sleep(300) # wait 5 minutes and try again
            else:
                row_dict[query_name] = count
                obtained = True
                break
        if not obtained:
            # One missing value makes the whole row unusable; stop early.
            all_succeeded = False
            break

    if all_succeeded:
        table.append(row_dict)

    return write_dicts_to_string(table, fieldnames)

def get_unit_affiliation_queries():
    """Return the list of per-unit SPARQL count queries.

    Each item is a dict with two keys:
      'name'  -- identifier of the query type
      'query' -- SPARQL text that selects ?unit and a ?count, grouped by ?unit

    Every query restricts ?unit with the pattern ``?unit wdt:P749+ wd:Q29052``.
    NOTE(review): judging by the variable names, the queries count people,
    women, men, ORCID holders and works per unit -- confirm the property
    meanings against Wikidata.
    """
    units_query_list = [
        # distinct ?person per unit
        {'name': 'units_total',
        'query': '''
        select ?unit (count(distinct ?person) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          }
        group by ?unit
        '''},
        # distinct ?woman per unit
        {'name': 'units_women',
        'query': '''
        select ?unit (count(distinct ?woman) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          }
        group by ?unit
        '''},
        # distinct ?man per unit
        {'name': 'units_men',
        'query': '''
        select ?unit (count(distinct ?man) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          }
        group by ?unit
        '''},
        # distinct ?person having a wdt:P496 value, per unit
        {'name': 'units_orcid',
        'query': '''
        select ?unit (count(distinct ?person) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?person wdt:P496 ?orcid.
          }
        group by ?unit
        '''},
        # distinct ?work linked to a ?person of the unit
        {'name': 'units_works',
        'query': '''
        select ?unit (count(distinct ?work) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?work wdt:P50 ?person.
          }
        group by ?unit
        '''},
        # distinct ?work linked to a ?man of the unit
        {'name': 'units_works_men',
        'query': '''
        select ?unit (count(distinct ?work) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          ?work wdt:P50 ?man.
          }
        group by ?unit
        '''},
        # distinct ?work linked to a ?woman of the unit
        {'name': 'units_works_women',
        'query': '''
        select ?unit (count(distinct ?work) as ?count)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          ?work wdt:P50 ?woman.
          }
        group by ?unit
        '''}
    ]
    return units_query_list

# This retrieves counts by unit for a particular query type, then appends the results
# for all of the units as a new row in the table.
# NOTE: unlike the other functions, the table here is a list of lists, not list of dicts.
# The first column must be the date.
def add_query_to_unit_table(table, query):
    """Append today's per-unit counts to table and return the table as CSV text.

    table: list of lists. The first row is the header: a date column followed
        by one unit Q ID per column. A new row is appended in place only when
        the query succeeded; a table without a header row is left unchanged
        because there are no columns to fill.
    query: SPARQL query text passed to get_unit_counts().

    The query is retried every 5 minutes for up to 12 attempts. A result that
    is not a list of {'unit', 'count'} dicts (such as the error payload
    returned by get_unit_counts) counts as a failed attempt. Units in the
    header that the query did not return get a count of '0'.
    """
    if not table:
        return write_lists_to_string(table)

    date = generate_utc_date()
    unit_headers = table[0][1:] # skip the first item (date)
    row_list = None

    # try to acquire the data for an hour
    for _ in range(12):
        try:
            results = get_unit_counts(query)
            # Indexing by unit validates the shape of every result and replaces
            # a scan of the whole result list for each header column.
            counts_by_unit = {result['unit']: result['count'] for result in results}
        except Exception:
            # retry boundary: any request or parsing failure is retried
            sleep(300) # wait 5 minutes and try again
        else:
            # The row is built only after the query is known to be good, so a
            # truncated row can never be appended.
            row_list = [date] + [counts_by_unit.get(unit, '0') for unit in unit_headers]
            break

    if row_list is not None:
        table.append(row_list)
    return write_lists_to_string(table)


In [None]:
# Main polling loop: once an hour, check whether the data have already been recorded
# for the current UTC date; if not, collect all datasets and write them back to GitHub.
while True: # infinite loop
    print('Time checked:', datetime.datetime.utcnow().isoformat())

    # last_run.txt holds the UTC date written at the end of the previous successful run
    date_last_run = read_string_from_github_file(organization_name, repo_name, path_to_directory, 'last_run.txt')
    print('Date last run:', date_last_run)

    date_now_utc = generate_utc_date()
    print('UTC date now is:', date_now_utc)

    # String comparison; chronological as long as both values are ISO 8601 dates (YYYY-MM-DD)
    if date_now_utc > date_last_run:
        # log into the GitHub API and create a repo instance
        # NOTE: update_file() reads this module-level repo variable rather than taking it as an argument
        repo = login_get_repo(repo_name, github_username, organization_name, organization_is_user, cred_directory)

        # Record today's Wikidata Vanderbilt-wide counts for items
        print('Item counts (university-wide):')

        # Retrieve old copy of data from GitHub
        filename = 'vandycite_item_data.csv'
        table = read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, filename)
        # Query the Wikidata Query Service to get today's data
        rawCsvText = get_vu_counts(table)
        # Write edited data back to GitHub via its API
        response = update_file(organization_name, repo_name, path_to_directory, filename, rawCsvText)
        print(response)
        print()

        # Record today's Wikidata item counts by Vanderbilt subsidiary unit
        print('Item counts by unit:')
        queries = get_unit_affiliation_queries()

        # Each query type has its own CSV file, named after the query
        for query_dict in queries:
            print(query_dict['name'])
            filename = query_dict['name'] + '.csv'
            table = read_lists_from_github_csv(organization_name, repo_name, path_to_directory, filename)
            rawCsvText = add_query_to_unit_table(table, query_dict['query'])
            #print(rawCsvText)
            response = update_file(organization_name, repo_name, path_to_directory, filename, rawCsvText)
            print(response)
        print()

        # Record today's counts of contributions to Wikidata by VandyCite members
        print('Contributions')
        filename = 'vandycite_edit_data.csv'
        table = read_dicts_from_github_csv(organization_name, repo_name, path_to_directory, filename)
        rawCsvText = get_vandycite_contribution_counts(organization_name, repo_name, path_to_directory, table)
        response = update_file(organization_name, repo_name, path_to_directory, filename, rawCsvText)
        print(response)

        # Update the date last run
        response = update_file(organization_name, repo_name, path_to_directory, 'last_run.txt', generate_utc_date() )
        print('done')
    print()
    # wait an hour before checking again
    sleep(3600)

Time checked: 2020-07-23T01:20:52.292060
Date last run: 2020-07-23
UTC date now is: 2020-07-23

