In [29]:
# import, configuration, functions, etc.

import csv
import datetime
import io
import json
from pathlib import Path
from time import sleep
from urllib.parse import quote

import requests
from github import Github

# the access token should be generated for read/write access to public repos
# see https://developer.github.com/v3/auth/#working-with-two-factor-authentication
# see https://github.com/settings/tokens/new
# select public_repo

# reference on PyGithub: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
# reference on GitHub API: https://developer.github.com/v3/guides/getting-started/

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint (also used as the delay after XTools calls)
accept_media_type = 'application/json' # value sent in the Accept header of the API requests

githubUsername = ''  # set to empty string if using a token (for 2FA)
organizationName = 'baskaufs' # account name used when building the GitHub URLs for the repo
organization_is_user = True # True: get the repo from the authenticated user's account; False: from the organization
repoName = 'test' # name of the repo that holds the CSV data files
credDirectory = 'home' # set to 'home' if the credential is in the home directory, otherwise working directory
pathToDirectory = '' # prefix prepended to a file name to form its path within the repo

# -----------------
# utility functions
# -----------------

# NOTE: change the user_agent_header string to something appropriate for your project
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers sent with the API calls.

    accept_media_type: value to place in the Accept header.
    Returns a dictionary holding the Accept and User-Agent headers.
    """
    # identifies this script and a contact address to the services it calls
    bot_identity = 'VanderDataBot/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/data; mailto:steve.baskauf@vanderbilt.edu)'
    headers = {}
    headers['Accept'] = accept_media_type
    headers['User-Agent'] = bot_identity
    return headers

def generate_utc_date():
    """Return the current UTC date as an ISO 8601 string, e.g. '2019-12-05'."""
    # datetime.utcnow() is deprecated (Python 3.12+); request an aware UTC
    # timestamp instead and keep only its date part.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# read from a CSV file into a list of dictionaries (representing a table)
def read_dicts_from_csv(filename):
    """Load a local UTF-8 CSV file as a table.

    filename: path of the CSV file; its first row supplies the column names.
    Returns a list with one dictionary per data row, keyed by column name.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # materialize every row before the file is closed
        return list(csv.DictReader(csv_file))

# read from a CSV file in GitHub into a list of dictionaries (representing a table)
def read_dicts_from_github_csv(organizationName, repoName, pathToDirectory, filename, branch='master', timeout=30):
    """Download a CSV file from a GitHub repo and return it as a table.

    organizationName: account that owns the repo.
    repoName: name of the repo.
    pathToDirectory: prefix prepended to filename to form the path in the repo.
    filename: name of the CSV file; its first row supplies the column names.
    branch: branch to read from (defaults to 'master', the previously hard-coded value).
    timeout: seconds to wait for the server before giving up.

    Returns a list with one dictionary per data row. An empty list is returned
    when the file does not exist (HTTP 404).
    Raises requests.HTTPError for any other unsuccessful response, so that an
    error page is never mistaken for an empty table.
    """
    path = pathToDirectory + filename
    url = 'https://raw.githubusercontent.com/' + organizationName + '/' + repoName + '/' + branch + '/' + path
    r = requests.get(url, timeout=timeout)
    if r.status_code == 404:
        # the file has not been created yet: start from an empty table
        return []
    # any other failure (rate limit, server error) must not look like "no data",
    # otherwise the caller would overwrite the stored history
    r.raise_for_status()
    # hand the csv module the untouched text so newlines inside quoted fields survive
    return list(csv.DictReader(io.StringIO(r.text, newline='')))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a table to a local UTF-8 CSV file.

    table: list of dictionaries, one per row.
    filename: path of the file to create or overwrite.
    fieldnames: column names, in the order they are written to the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# write a list of dictionaries to a CSV-formatted string
def write_dicts_to_string(table, fieldnames):
    """Serialize a table as CSV text.

    table: list of dictionaries, one per row.
    fieldnames: column names, in the order they are written to the header row.
    Returns the CSV text (header row followed by the data rows).
    """
    with io.StringIO() as buffer:
        csv_writer = csv.DictWriter(buffer, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)
        # read the text out before the buffer is closed
        return buffer.getvalue()
# -----------------
# functions for interacting with APIs
# -----------------

# This function sends a query to a SPARQL endpoint and returns a single value.
# For the Wikidata SPARQL endpoint, it extracts "single_value" from the query.
def get_single_value(query, endpoint_url, timeout=120):
    """Run a SPARQL query and return the value bound to ?single_value.

    query: SPARQL query text whose first result row binds a variable named single_value.
    endpoint_url: URL of the SPARQL endpoint the query is sent to.
    timeout: seconds to wait for the server before requests raises a timeout error.

    Returns the string value of single_value from the first result row. When the
    response cannot be interpreted (not JSON, or no such binding), a one-item
    list holding the raw response text is returned instead.
    Network-level failures raised by requests propagate to the caller.
    """
    r = requests.get(endpoint_url, params={'query' : query}, headers = generate_header_dictionary(accept_media_type), timeout=timeout)
    try:
        data = r.json()
        # The results/bindings layout is the standard SPARQL JSON results format,
        # so it is read the same way for every endpoint. (Previously only the
        # Wikidata URL was handled and any other endpoint left the result unset.)
        value = data['results']['bindings'][0]['single_value']['value']
    except (ValueError, KeyError, IndexError, TypeError):
        # not JSON, or JSON without the expected binding: hand back the raw text
        value = [r.text]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return value

# This function sends a query to the XTools Edit Counter and returns a single value.
def get_xtools_edit_counts(username, project, namespace, timeout=60):
    """Ask the XTools simple edit counter for a user's live edit count.

    username: account name of the editor.
    project: project identifier used in the API path (e.g. 'wikidata').
    namespace: namespace identifier used in the API path, as a string.
    timeout: seconds to wait for the server before requests raises a timeout error.

    Returns the 'live_edit_count' value from the JSON response. When the response
    cannot be interpreted (not JSON, or no such key), a one-item list holding the
    raw response text is returned instead.
    Network-level failures raised by requests propagate to the caller.
    """
    # percent-encode each path segment so unusual characters cannot corrupt the URL
    query_url = ('https://xtools.wmflabs.org/api/user/simple_editcount/'
                 + quote(project, safe='') + '/'
                 + quote(username, safe='') + '/'
                 + quote(namespace, safe=''))
    r = requests.get(query_url, headers = generate_header_dictionary(accept_media_type), timeout=timeout)
    try:
        data = r.json()
        value = data['live_edit_count']
    except (ValueError, KeyError, TypeError):
        # not JSON, or JSON without the expected key: hand back the raw text
        value = [r.text]
    # delay to avoid hitting the API too rapidly
    sleep(sparql_sleep)
    return value

# -----------------
# functions for interacting with GitHub
# -----------------

# value of directory should be either 'home' or 'working'
def loadCredential(filename, directory):
    """Read a credential (token or password) from a text file.

    filename: name of the credential file.
    directory: 'home' to look in the user's home directory; any other value
        looks for filename relative to the working directory.

    Returns the file's text with surrounding whitespace removed, so a trailing
    newline in the file does not end up inside the credential.
    Raises SystemExit (after printing a hint) when the file cannot be read.
    """
    if directory == 'home':
        credentialPath = Path.home() / filename
    else:
        # anything other than 'home' is reported as the working directory
        directory = 'working'
        credentialPath = Path(filename)
    try:
        cred = credentialPath.read_text(encoding='utf-8')
    except OSError:
        print(filename + ' file not found - is it in your ' + directory + ' directory?')
        # raise explicitly: the interactive exit() helper does not reliably
        # stop execution inside a notebook kernel
        raise SystemExit(1)
    return cred.strip()

# set organization_is_user to True to use an individual account (organizationName is then ignored)
# pass in an empty string for githubUsername to use a token instead of username login
def loginGetRepo(repoName, githubUsername, organizationName, organization_is_user, credDirectory):
    """Authenticate with GitHub and return a repo object.

    repoName: name of the repo to open.
    githubUsername: login name for password authentication; an empty string
        selects token authentication instead.
    organizationName: organization that owns the repo; only used when
        organization_is_user is false.
    organization_is_user: when true, the repo is looked up under the
        authenticated user's own account.
    credDirectory: passed to loadCredential to locate the credential file.

    Returns the repo object obtained from the GitHub client.
    """
    use_token = githubUsername == ''
    if use_token:
        client = Github(login_or_token = loadCredential('test_token.txt', credDirectory))
    else:
        client = Github(githubUsername, loadCredential('pwd.txt', credDirectory))

    if not organization_is_user:
        # repo belongs to an organization to which the credential's owner has access
        return client.get_organization(organizationName).get_repo(repoName)

    # repo belongs to the authenticated user; organizationName is not consulted
    return client.get_user().get_repo(repoName)

def getUserList(repo):
    """Return the login names of the repo's collaborators as a list.

    repo: object providing get_collaborators(), whose items expose a login attribute.
    """
    return [collaborator.login for collaborator in repo.get_collaborators()]

def getFileSha(account, repo, filePath, timeout=30):
    """Look up the blob SHA of a file in a GitHub repo.

    account: account that owns the repo.
    repo: name of the repo (a string, not a repo object).
    filePath: path of the file within the repo.
    timeout: seconds to wait for the server before requests raises a timeout error.

    Returns the file's SHA, or an empty string when the file does not exist
    or the response does not describe a single file.
    Raises requests.HTTPError for any other unsuccessful response (for example
    a rate-limit refusal), so that an API failure is not mistaken for a
    missing file.
    """
    # get the data about the file to get its blob SHA
    r = requests.get('https://api.github.com/repos/' + account + '/' + repo + '/contents/' + filePath, timeout=timeout)
    if r.status_code == 404:
        # if the file doesn't already exist on GitHub, no sha will be returned
        return ''
    r.raise_for_status()
    fileData = r.json()
    if isinstance(fileData, dict):
        return fileData.get('sha', '')
    # a non-object response (e.g. a listing) carries no single sha
    return ''

# use this function to update an existing text file
def updateFile(account, repoName, pathToDirectory, filename, content, repo_instance=None):
    """Create or update a text file in a GitHub repo.

    account: account that owns the repo (used to look up the existing file's SHA).
    repoName: name of the repo (used to look up the existing file's SHA).
    pathToDirectory: prefix prepended to filename to form the path in the repo.
    filename: name of the file to write.
    content: new text content of the file.
    repo_instance: repo object used to perform the commit. When omitted, the
        module-level variable named repo is used, as before.

    Returns whatever the repo object's create_file or update_file call returns.
    """
    if repo_instance is None:
        # backward-compatible fallback to the notebook-level repo object
        repo_instance = repo
    path = pathToDirectory + filename
    commitMessage = 'Update ' + filename + ' file via API'
    sha = getFileSha(account, repoName, path)
    if sha == '':
        # no existing blob: the file has to be created
        response = repo_instance.create_file(path, commitMessage, content)
    else:
        # the existing blob's SHA is required to replace it
        response = repo_instance.update_file(path, commitMessage, content, sha)
    return response

# -----------------
# top-level functions for acquiring the main datasets
# -----------------

# Retrieves the total contributions for all of the participants in the VandyCite project
# If it fails due to timeout or some other error, the table remains unchanged
# Returns a raw CSV string
def get_vandycite_contribution_counts(table):
    """Add today's edit counts for every VandyCite user to the table.

    table: list of row dictionaries holding the previously recorded data; a new
        row is appended in place only when a count was obtained for every user.

    The usernames are read from the 'username' column of the local file
    vandycite_users.csv. Each lookup is retried up to 12 times, 5 minutes
    apart, when it raises an error.

    Returns the (possibly extended) table serialized as CSV text with the
    columns 'date' followed by one column per username.
    """
    max_tries = 12
    retry_wait = 300 # seconds; 12 tries x 5 minutes = about an hour per user

    # Get username list
    user_dicts = read_dicts_from_csv('vandycite_users.csv')
    vandycite_user_list = [user['username'] for user in user_dicts]

    # Retrieve data from XTools Edit Counter API
    project = 'wikidata'
    namespace = '0' # 0 is the main namespace

    fieldnames = ['date'] + vandycite_user_list
    today = generate_utc_date()
    row_dict = {'date': today}

    # The row is recorded only if every user's count was obtained; checking just
    # the last user would let a partially filled row into the table.
    complete = True
    for username in vandycite_user_list:
        print(username)
        for _ in range(max_tries):
            try:
                row_dict[username] = get_xtools_edit_counts(username, project, namespace)
                break
            except Exception:
                # Exception (not a bare except) so Ctrl-C can still stop the wait
                sleep(retry_wait) # wait 5 minutes and try again
        else:
            # every try failed; the row cannot be completed, so stop here
            complete = False
            break

    if complete and vandycite_user_list:
        table.append(row_dict)

    return write_dicts_to_string(table, fieldnames)

# Runs all of the queries that retrieve a single value for the whole university
# If it fails due to timeout or some other error, the table remains unchanged
# Returns a raw CSV string
def get_vu_counts(table):
    """Add today's university-wide Wikidata counts to the table.

    table: list of row dictionaries holding the previously recorded data; a new
        row is appended in place only when every query produced a value.

    Each query is sent to the Wikidata Query Service and is retried up to 12
    times, 5 minutes apart, when it raises an error.

    Returns the (possibly extended) table serialized as CSV text with the
    columns 'date' followed by one column per query name.
    """
    max_tries = 12
    retry_wait = 300 # seconds; 12 tries x 5 minutes = about an hour per query

    all_vu_query_list = [
        {'name': 'vu_total',
        'query': '''
        select (count(distinct ?person) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          }
        '''},
        {'name': 'vu_men',
        'query': '''
        select (count(distinct ?man) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          }
        '''},
        {'name': 'vu_women',
        'query': '''
        select (count(distinct ?woman) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          }
        '''},
        {'name': 'vu_orcid',
        'query': '''
        select (count(distinct ?person) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?person wdt:P496 ?orcid.
          }
        '''},
        {'name': 'vu_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?person wdt:P1416 ?unit.
          ?work wdt:P50 ?person.
          }
        '''},
        {'name': 'vu_men_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?man wdt:P1416 ?unit.
          ?man wdt:P21 wd:Q6581097.
          ?work wdt:P50 ?man.
          }
        '''},
        {'name': 'vu_women_works',
        'query': '''
        select (count(distinct ?work) as ?single_value)  where {
          ?unit wdt:P749+ wd:Q29052.
          ?woman wdt:P1416 ?unit.
          ?woman wdt:P21 wd:Q6581072.
          ?work wdt:P50 ?woman.
          }
        '''},
    ]

    # Retrieve data from Wikidata Query Service
    endpoint_url = 'https://query.wikidata.org/sparql'

    # Build the full header up front so it is complete even if the loop stops early
    fieldnames = ['date'] + [query_dict['name'] for query_dict in all_vu_query_list]
    today = generate_utc_date()
    row_dict = {'date': today}

    # The row is recorded only if every query produced a value; checking just
    # the last query would let a partially filled row into the table.
    complete = True
    for query_dict in all_vu_query_list:
        query_name = query_dict['name']
        print(query_name)
        for _ in range(max_tries):
            try:
                row_dict[query_name] = get_single_value(query_dict['query'], endpoint_url)
                break
            except Exception:
                # Exception (not a bare except) so Ctrl-C can still stop the wait
                sleep(retry_wait) # wait 5 minutes and try again
        else:
            # every try failed; the row cannot be completed, so stop here
            complete = False
            break

    if complete:
        table.append(row_dict)

    return write_dicts_to_string(table, fieldnames)

In [30]:
# Driver cell: refreshes the two CSV data files stored in the GitHub repo.
# For each file it downloads the current copy, adds today's data, and commits the result.
print('Time checked:', datetime.datetime.utcnow().isoformat())
# log into the GitHub API and create a repo instance
# (updateFile reads this module-level variable to perform the commits)
repo = loginGetRepo(repoName, githubUsername, organizationName, organization_is_user, credDirectory)

# Record today's Wikidata Vanderbilt-wide counts for items

# Retrieve old copy of data from GitHub
filename = 'vandycite_item_data.csv'
table = read_dicts_from_github_csv(organizationName, repoName, pathToDirectory, filename)
# Query the Wikidata Query Service to get today's data
rawCsvText = get_vu_counts(table)
# Write edited data back to GitHub via its API
response = updateFile(organizationName, repoName, pathToDirectory, filename, rawCsvText)
print('Item counts:', response)
print()

# Record today's counts of contributions to Wikidata by VandyCite members
# (same steps as above: download, add today's row, commit)
filename = 'vandycite_edit_data.csv'
table = read_dicts_from_github_csv(organizationName, repoName, pathToDirectory, filename)
rawCsvText = get_vandycite_contribution_counts(table)
response = updateFile(organizationName, repoName, pathToDirectory, filename, rawCsvText)
print('Contributions', response)
print()

Time checked: 2020-07-20T20:33:23.570305
vu_total
vu_men
vu_women
vu_orcid
vu_works
vu_men_works
vu_women_works
Item counts: {'commit': Commit(sha="755b7c874f514819e24b8e7a2de5e306987c15b1"), 'content': ContentFile(path="vandycite_item_data.csv")}
Clifford_Anderson
Baskaufs
Fmlester
Ramonavromero
Talinum
Celiaswalker
CatonMA2
Gridersd
JeffBTaylor
Marjans74
Charlotte_Y._Lew
KukanaLuika
VanderBot
Contributions {'commit': Commit(sha="abd311319da0075114dec06b8c37dbbd8bd478ad"), 'content': ContentFile(path="vandycite_edit_data.csv")}
